[V2] erofs: support flattened block device for multi-blob images
Commit Message
In order to support mounting a multi-blob container image as a single
block device, add a flattened block device feature for EROFS.
In this mode, all meta/data contents will be mapped into one block
address space. Users can compose a block device (via nbd/ublk/virtio-blk/
vhost-user-blk) from multiple sources and mount that block device with
EROFS directly. This reduces the number of block devices used, and
it is also beneficial in both VM file passthrough and distributed storage
scenarios.
You can test this using the method mentioned by:
https://github.com/dragonflyoss/image-service/pull/1111
1. Compose an (nbd) block device from multiple blobs.
2. Mount EROFS on mntdir/.
3. Compare the md5sum between source dir and mntdir/.
Later, we could also use it to refer to the original tar blobs.
Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
---
v2:
1. Supplement commit message.
2. Add a bool field in erofs_dev_context to indicate flattened block
device mode.
---
fs/erofs/data.c | 8 ++++++--
fs/erofs/internal.h | 1 +
fs/erofs/super.c | 6 +++++-
3 files changed, 12 insertions(+), 3 deletions(-)
Comments
On 3/1/23 8:59 PM, Jia Zhu wrote:
> In order to support mounting multi-blobs container image as a single
> block device, add flattened block device feature for EROFS.
>
> In this mode, all meta/data contents will be mapped into one block
> address. User could compose a block device(by nbd/ublk/virtio-blk/
> vhost-user-blk) from multiple sources and mount the block device by
> EROFS directly. It can reduce the number of block devices used, and
> it's also benefits in both VM file passthrough and distributed storage
> scenarios.
>
> You can test this using the method mentioned by:
> https://github.com/dragonflyoss/image-service/pull/1111
> 1. Compose a (nbd)block device from multi-blobs.
> 2. Mount EROFS on mntdir/.
> 3. Compare the md5sum between source dir and mntdir/.
>
> Later, we could also use it to refer original tar blobs.
>
> Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
> Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> ---
> v2:
> 1. Supplement commit message.
> 2. Add a bool field in erofs_dev_context to indicate flattened block
> device mode.
> ---
> fs/erofs/data.c | 8 ++++++--
> fs/erofs/internal.h | 1 +
> fs/erofs/super.c | 6 +++++-
> 3 files changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index e16545849ea7..818f78ce648c 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -197,7 +197,6 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
> struct erofs_device_info *dif;
> int id;
>
> - /* primary device by default */
> map->m_bdev = sb->s_bdev;
> map->m_daxdev = EROFS_SB(sb)->dax_dev;
> map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
> @@ -210,12 +209,17 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
> up_read(&devs->rwsem);
> return -ENODEV;
> }
> + if (devs->flatdev) {
> + map->m_pa += blknr_to_addr(dif->mapped_blkaddr);
> + up_read(&devs->rwsem);
> + return 0;
> + }
> map->m_bdev = dif->bdev;
> map->m_daxdev = dif->dax_dev;
> map->m_dax_part_off = dif->dax_part_off;
> map->m_fscache = dif->fscache;
> up_read(&devs->rwsem);
> - } else if (devs->extra_devices) {
> + } else if (devs->extra_devices && !devs->flatdev) {
> down_read(&devs->rwsem);
> idr_for_each_entry(&devs->tree, dif, id) {
> erofs_off_t startoff, length;
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index 3f3561d37d1b..4fee380a98d9 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -81,6 +81,7 @@ struct erofs_dev_context {
> struct rw_semaphore rwsem;
>
> unsigned int extra_devices;
> + bool flatdev;
> };
>
> struct erofs_fs_context {
> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
> index 19b1ae79cec4..307b3d2392cf 100644
> --- a/fs/erofs/super.c
> +++ b/fs/erofs/super.c
> @@ -248,7 +248,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
> if (IS_ERR(fscache))
> return PTR_ERR(fscache);
> dif->fscache = fscache;
> - } else {
> + } else if (!sbi->devs->flatdev) {
> bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
> sb->s_type);
> if (IS_ERR(bdev))
> @@ -281,6 +281,10 @@ static int erofs_scan_devices(struct super_block *sb,
> else
> ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
>
> + if (!sbi->devs->extra_devices && ondisk_extradevs &&
> + !erofs_is_fscache_mode(sb))
> + sbi->devs->flatdev = true;
> +
I would move this check down after all sanity checks, e.g.
if (!ondisk_extradevs)
return 0;
+ if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb))
+ sbi->devs->flatdev = true;
Otherwise LGTM.
@@ -197,7 +197,6 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
struct erofs_device_info *dif;
int id;
- /* primary device by default */
map->m_bdev = sb->s_bdev;
map->m_daxdev = EROFS_SB(sb)->dax_dev;
map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
@@ -210,12 +209,17 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return -ENODEV;
}
+ if (devs->flatdev) {
+ map->m_pa += blknr_to_addr(dif->mapped_blkaddr);
+ up_read(&devs->rwsem);
+ return 0;
+ }
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
up_read(&devs->rwsem);
- } else if (devs->extra_devices) {
+ } else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
erofs_off_t startoff, length;
@@ -81,6 +81,7 @@ struct erofs_dev_context {
struct rw_semaphore rwsem;
unsigned int extra_devices;
+ bool flatdev;
};
struct erofs_fs_context {
@@ -248,7 +248,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (IS_ERR(fscache))
return PTR_ERR(fscache);
dif->fscache = fscache;
- } else {
+ } else if (!sbi->devs->flatdev) {
bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
sb->s_type);
if (IS_ERR(bdev))
@@ -281,6 +281,10 @@ static int erofs_scan_devices(struct super_block *sb,
else
ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+ if (!sbi->devs->extra_devices && ondisk_extradevs &&
+ !erofs_is_fscache_mode(sb))
+ sbi->devs->flatdev = true;
+
if (sbi->devs->extra_devices &&
ondisk_extradevs != sbi->devs->extra_devices) {
erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",