@@ -1147,6 +1147,8 @@ struct ext4_inode_info {
*/
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
+ struct list_head i_iomap_ioend_list;
+ struct work_struct i_iomap_ioend_work;
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -3755,6 +3757,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
+extern void ext4_iomap_end_io(struct work_struct *work);
+extern void ext4_iomap_end_bio(struct bio *bio);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
@@ -11,6 +11,12 @@ int ext4_inode_journal_mode(struct inode *inode)
{
if (EXT4_JOURNAL(inode) == NULL)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /*
+ * Ordered mode is no longer needed for the inode that use the
+ * iomap path, always use writeback mode.
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
/* We do not support data journalling with delayed allocation */
if (!S_ISREG(inode->i_mode) ||
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
@@ -3708,18 +3708,23 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
- /* If extent is larger than requested it is a clear sign that we still
- * have some extent state machine issues left. So extent_split is still
- * required.
- * TODO: Once all related issues will be fixed this situation should be
- * illegal.
+ /*
+ * For the inodes that use the buffered iomap path need to split
+ * extents in endio, other inodes not.
+ *
+ * TODO: Reserve enough sapce for splitting extents, always split
+ * extents here, and totally remove this warning.
*/
if (ee_block != map->m_lblk || ee_len > map->m_len) {
#ifdef CONFIG_EXT4_DEBUG
- ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
- " len %u; IO logical block %llu, len %u",
- inode->i_ino, (unsigned long long)ee_block, ee_len,
- (unsigned long long)map->m_lblk, map->m_len);
+ if (!ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ ext4_warning(inode->i_sb,
+ "Inode (%ld) finished: extent logical block %llu, "
+ "len %u; IO logical block %llu, len %u",
+ inode->i_ino, (unsigned long long)ee_block,
+ ee_len, (unsigned long long)map->m_lblk,
+ map->m_len);
+ }
#endif
err = ext4_split_convert_extents(handle, inode, map, ppath,
EXT4_GET_BLOCKS_CONVERT |
@@ -43,6 +43,7 @@
#include <linux/iversion.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"
@@ -3705,10 +3706,197 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
iomap_readahead(rac, &ext4_iomap_buffered_read_ops);
}
+struct ext4_writeback_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct writeback_control *wbc;
+ unsigned int data_seq;
+};
+
+static int ext4_iomap_map_one_extent(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ struct extent_status es;
+ handle_t *handle = NULL;
+ int credits, map_flags;
+ int retval;
+
+ credits = ext4_da_writepages_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ map->m_flags = 0;
+ /*
+ * In order to protect from the race of truncate, we have to lookup
+ * extent stats and map blocks under i_data_sem, otherwise the
+ * delalloc extent could be stale.
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (likely(ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es))) {
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ map->m_len = min_t(unsigned int, retval, map->m_len);
+
+ if (likely(ext4_es_is_delonly(&es))) {
+ trace_ext4_da_write_pages_extent(inode, map);
+ /*
+ * Call ext4_map_create_blocks() to allocate any delayed
+ * allocation blocks. It is possible that we're going to
+ * need more metadata blocks, however we must not fail
+ * because we're in writeback and there is nothing we
+ * can do so it might result in data loss. So use
+ * reserved blocks to allocate metadata if possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE
+ * indicates that the blocks and quotas has already been
+ * checked when the data was copied into the page cache.
+ */
+ map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL |
+ EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ retval = ext4_map_create_blocks(handle, inode, map,
+ map_flags);
+ }
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) + map->m_lblk -
+ es.es_lblk;
+ map->m_flags = ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ }
+ } else {
+ retval = ext4_map_query_blocks(handle, inode, map, 0);
+ }
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ return retval < 0 ? retval : 0;
+}
+
+static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset,
+ unsigned int dirty_len)
+{
+ struct ext4_writeback_ctx *ewpc =
+ container_of(wpc, struct ext4_writeback_ctx, ctx);
+ struct super_block *sb = inode->i_sb;
+ struct journal_s *journal = EXT4_SB(sb)->s_journal;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int index = offset >> blkbits;
+ unsigned int end, len;
+ int ret;
+
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ /* Check validity of the cached writeback mapping. */
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length &&
+ ewpc->data_seq == READ_ONCE(ei->i_es_seq))
+ return 0;
+
+ end = min_t(unsigned int,
+ (ewpc->wbc->range_end >> blkbits), (UINT_MAX - 1));
+ len = (end > index + dirty_len) ? end - index + 1 : dirty_len;
+
+retry:
+ map.m_lblk = index;
+ map.m_len = min_t(unsigned int, EXT_UNWRITTEN_MAX_LEN, len);
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * The map isn't a delalloc extent, it must be a hole or have
+ * already been allocated.
+ */
+ if (!(map.m_flags & EXT4_MAP_DELAYED))
+ goto out;
+
+ /* Map one delalloc extent. */
+ ret = ext4_iomap_map_one_extent(inode, &map);
+ if (ret < 0) {
+ if (ext4_forced_shutdown(sb))
+ return ret;
+
+ /*
+ * Retry transient ENOSPC errors, if
+ * ext4_count_free_blocks() is non-zero, a commit
+ * should free up blocks.
+ */
+ if (ret == -ENOSPC && ext4_count_free_clusters(sb)) {
+ jbd2_journal_force_commit_nested(journal);
+ goto retry;
+ }
+
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with "
+ "max blocks %u with error %d",
+ inode->i_ino, (unsigned long long)map.m_lblk,
+ (unsigned int)map.m_len, -ret);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (ret == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ return ret;
+ }
+out:
+ ewpc->data_seq = READ_ONCE(ei->i_es_seq);
+ ext4_set_iomap(inode, &wpc->iomap, &map, offset,
+ map.m_len << blkbits, 0);
+ return 0;
+}
+
+static int ext4_iomap_prepare_ioend(struct iomap_ioend *ioend, int status)
+{
+ struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+
+ /* Need to convert unwritten extents when I/Os are completed. */
+ if (ioend->io_type == IOMAP_UNWRITTEN ||
+ ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
+ ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
+
+ return status;
+}
+
+static void ext4_iomap_discard_folio(struct folio *folio, loff_t pos)
+{
+ struct inode *inode = folio->mapping->host;
+
+ ext4_iomap_punch_delalloc(inode, pos,
+ folio_pos(folio) + folio_size(folio) - pos);
+}
+
+static const struct iomap_writeback_ops ext4_writeback_ops = {
+ .map_blocks = ext4_iomap_map_blocks,
+ .prepare_ioend = ext4_iomap_prepare_ioend,
+ .discard_folio = ext4_iomap_discard_folio,
+};
+
static int ext4_iomap_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return 0;
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+ long nr = wbc->nr_to_write;
+ int alloc_ctx, ret;
+ struct ext4_writeback_ctx ewpc = {
+ .wbc = wbc,
+ };
+
+ if (unlikely(ext4_forced_shutdown(sb)))
+ return -EIO;
+
+ alloc_ctx = ext4_writepages_down_read(sb);
+ trace_ext4_writepages(inode, wbc);
+ ret = iomap_writepages(mapping, wbc, &ewpc.ctx, &ext4_writeback_ops);
+ trace_ext4_writepages_result(inode, wbc, ret, nr - wbc->nr_to_write);
+ ext4_writepages_up_read(sb, alloc_ctx);
+
+ return ret;
}
/*
@@ -22,6 +22,7 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
+#include <linux/iomap.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
@@ -565,3 +566,109 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
return 0;
}
+
+static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
+{
+ struct inode *inode = ioend->io_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ loff_t pos = ioend->io_offset;
+ size_t size = ioend->io_size;
+ loff_t new_disksize;
+ handle_t *handle;
+ int credits;
+ int ret, err;
+
+ ret = blk_status_to_errno(ioend->io_bio.bi_status);
+ if (unlikely(ret))
+ goto out;
+
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ credits = ext4_chunk_trans_blocks(inode,
+ EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits));
+ handle = ext4_journal_start(inode, EXT4_HT_EXT_CONVERT, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_err;
+ }
+
+ if (ioend->io_type == IOMAP_UNWRITTEN) {
+ ret = ext4_convert_unwritten_extents(handle, inode, pos, size);
+ if (ret)
+ goto out_journal;
+ }
+
+ /*
+ * Update on-disk size after IO is completed. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
+ new_disksize = pos + size;
+ if (new_disksize > READ_ONCE(ei->i_disksize)) {
+ down_write(&ei->i_data_sem);
+ new_disksize = min(new_disksize, i_size_read(inode));
+ if (new_disksize > ei->i_disksize)
+ ei->i_disksize = new_disksize;
+ up_write(&ei->i_data_sem);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ EXT4_ERROR_INODE_ERR(inode, -ret,
+ "Failed to mark inode dirty");
+ }
+
+out_journal:
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+out_err:
+ if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to "
+ "written extents or update inode size -- "
+ "potential data loss! (inode %lu, error %d)",
+ inode->i_ino, ret);
+ }
+out:
+ iomap_finish_ioends(ioend, ret);
+}
+
+/*
+ * Work on buffered iomap completed IO, to convert unwritten extents to
+ * mapped extents
+ */
+void ext4_iomap_end_io(struct work_struct *work)
+{
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_iomap_ioend_work);
+ struct iomap_ioend *ioend;
+ struct list_head ioend_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ list_replace_init(&ei->i_iomap_ioend_list, &ioend_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ iomap_sort_ioends(&ioend_list);
+ while (!list_empty(&ioend_list)) {
+ ioend = list_entry(ioend_list.next, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ iomap_ioend_try_merge(ioend, &ioend_list);
+ ext4_iomap_finish_ioend(ioend);
+ }
+}
+
+void ext4_iomap_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+ struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+ struct ext4_sb_info *sbi = EXT4_SB(ioend->io_inode->i_sb);
+ unsigned long flags;
+
+ /* Only reserved conversions from writeback should enter here */
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (list_empty(&ei->i_iomap_ioend_list))
+ queue_work(sbi->rsv_conversion_wq, &ei->i_iomap_ioend_work);
+ list_add_tail(&ioend->io_list, &ei->i_iomap_ioend_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
@@ -1431,11 +1431,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ INIT_LIST_HEAD(&ei->i_iomap_ioend_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ INIT_WORK(&ei->i_iomap_ioend_work, ext4_iomap_end_io);
ext4_fc_init_inode(&ei->vfs_inode);
mutex_init(&ei->i_fc_lock);
return &ei->vfs_inode;