[v8,01/11] btrfs: add raid stripe tree definitions

Message ID 20230911-raid-stripe-tree-v8-1-647676fa852c@wdc.com
State New
Headers
Series btrfs: introduce RAID stripe tree |

Commit Message

Johannes Thumshirn Sept. 11, 2023, 12:52 p.m. UTC
  Add definitions for the raid stripe tree. This tree will hold information
about the on-disk layout of the stripes in a RAID set.

Each stripe extent has a 1:1 relationship with an on-disk extent item and
is doing the logical to per-drive physical address translation for the
extent item in question.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/accessors.h            | 10 ++++++++++
 fs/btrfs/locking.c              |  5 +++--
 include/uapi/linux/btrfs_tree.h | 33 +++++++++++++++++++++++++++++++--
 3 files changed, 44 insertions(+), 4 deletions(-)
  

Comments

Damien Le Moal Sept. 11, 2023, 9 p.m. UTC | #1
On 9/11/23 21:52, Johannes Thumshirn wrote:
> Add definitions for the raid stripe tree. This tree will hold information
> about the on-disk layout of the stripes in a RAID set.
> 
> Each stripe extent has a 1:1 relationship with an on-disk extent item and
> is doing the logical to per-drive physical address translation for the
> extent item in question.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/accessors.h            | 10 ++++++++++
>  fs/btrfs/locking.c              |  5 +++--
>  include/uapi/linux/btrfs_tree.h | 33 +++++++++++++++++++++++++++++++--
>  3 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
> index f958eccff477..977ff160a024 100644
> --- a/fs/btrfs/accessors.h
> +++ b/fs/btrfs/accessors.h
> @@ -306,6 +306,16 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
>  BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
>  BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
>  
> +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
> +BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
> +BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
> +BTRFS_SETGET_FUNCS(raid_stride_length, struct btrfs_raid_stride, length, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
> +			 struct btrfs_stripe_extent, encoding, 8);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_length, struct btrfs_raid_stride, length, 64);
> +
>  /* struct btrfs_dev_extent */
>  BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
>  BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
> index 6ac4fd8cc8dc..e7760d40feab 100644
> --- a/fs/btrfs/locking.c
> +++ b/fs/btrfs/locking.c
> @@ -58,8 +58,8 @@
>  
>  static struct btrfs_lockdep_keyset {
>  	u64			id;		/* root objectid */
> -	/* Longest entry: btrfs-block-group-00 */
> -	char			names[BTRFS_MAX_LEVEL][24];
> +	/* Longest entry: btrfs-raid-stripe-tree-00 */
> +	char			names[BTRFS_MAX_LEVEL][25];
>  	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
>  } btrfs_lockdep_keysets[] = {
>  	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
> @@ -74,6 +74,7 @@ static struct btrfs_lockdep_keyset {
>  	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
>  	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
>  	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
> +	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID,DEFINE_NAME("raid-stripe-tree") },
>  	{ .id = 0,				DEFINE_NAME("tree")	},
>  };
>  
> diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
> index fc3c32186d7e..3fb758ce3ac0 100644
> --- a/include/uapi/linux/btrfs_tree.h
> +++ b/include/uapi/linux/btrfs_tree.h
> @@ -4,9 +4,8 @@
>  
>  #include <linux/btrfs.h>
>  #include <linux/types.h>
> -#ifdef __KERNEL__
>  #include <linux/stddef.h>
> -#else
> +#ifndef __KERNEL__
>  #include <stddef.h>
>  #endif

This change seems unrelated to the RAID stripe tree. Should this be a patch on
its own ?

>  
> @@ -73,6 +72,9 @@
>  /* Holds the block group items for extent tree v2. */
>  #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
>  
> +/* tracks RAID stripes in block groups. */
> +#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
> +
>  /* device stats in the device tree */
>  #define BTRFS_DEV_STATS_OBJECTID 0ULL
>  
> @@ -285,6 +287,8 @@
>   */
>  #define BTRFS_QGROUP_RELATION_KEY       246
>  
> +#define BTRFS_RAID_STRIPE_KEY		247
> +
>  /*
>   * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
>   */
> @@ -719,6 +723,31 @@ struct btrfs_free_space_header {
>  	__le64 num_bitmaps;
>  } __attribute__ ((__packed__));
>  
> +struct btrfs_raid_stride {
> +	/* btrfs device-id this raid extent lives on */
> +	__le64 devid;
> +	/* physical location on disk */
> +	__le64 physical;
> +	/* length of stride on this disk */
> +	__le64 length;
> +};
> +
> +#define BTRFS_STRIPE_DUP	0
> +#define BTRFS_STRIPE_RAID0	1
> +#define BTRFS_STRIPE_RAID1	2
> +#define BTRFS_STRIPE_RAID1C3	3
> +#define BTRFS_STRIPE_RAID1C4	4
> +#define BTRFS_STRIPE_RAID5	5
> +#define BTRFS_STRIPE_RAID6	6
> +#define BTRFS_STRIPE_RAID10	7
> +
> +struct btrfs_stripe_extent {
> +	__u8 encoding;
> +	__u8 reserved[7];
> +	/* array of raid strides this stripe is composed of */
> +	__DECLARE_FLEX_ARRAY(struct btrfs_raid_stride, strides);
> +};
> +
>  #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
>  #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
>  
>
  
Johannes Thumshirn Sept. 12, 2023, 6:09 a.m. UTC | #2
On 11.09.23 23:01, Damien Le Moal wrote:
>> diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
>> index fc3c32186d7e..3fb758ce3ac0 100644
>> --- a/include/uapi/linux/btrfs_tree.h
>> +++ b/include/uapi/linux/btrfs_tree.h
>> @@ -4,9 +4,8 @@
>>   
>>   #include <linux/btrfs.h>
>>   #include <linux/types.h>
>> -#ifdef __KERNEL__
>>   #include <linux/stddef.h>
>> -#else
>> +#ifndef __KERNEL__
>>   #include <stddef.h>
>>   #endif
> 
> This change seems unrelated to the RAID stripe tree. Should this be a patch on
> its own ?

Nope it isn't. This patch introduces a user of __DECLARE_FLEX_ARRAY() 
and without the moved ifdef userspace can't find the definition of it.
  
David Sterba Sept. 12, 2023, 8:32 p.m. UTC | #3
On Mon, Sep 11, 2023 at 05:52:02AM -0700, Johannes Thumshirn wrote:
> Add definitions for the raid stripe tree. This tree will hold information
> about the on-disk layout of the stripes in a RAID set.
> 
> Each stripe extent has a 1:1 relationship with an on-disk extent item and
> is doing the logical to per-drive physical address translation for the
> extent item in question.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/accessors.h            | 10 ++++++++++
>  fs/btrfs/locking.c              |  5 +++--
>  include/uapi/linux/btrfs_tree.h | 33 +++++++++++++++++++++++++++++++--
>  3 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
> index f958eccff477..977ff160a024 100644
> --- a/fs/btrfs/accessors.h
> +++ b/fs/btrfs/accessors.h
> @@ -306,6 +306,16 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
>  BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
>  BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
>  
> +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);

What is encoding referring to?

> +BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
> +BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
> +BTRFS_SETGET_FUNCS(raid_stride_length, struct btrfs_raid_stride, length, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
> +			 struct btrfs_stripe_extent, encoding, 8);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_length, struct btrfs_raid_stride, length, 64);
> +
>  /* struct btrfs_dev_extent */
>  BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
>  BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
> index 6ac4fd8cc8dc..e7760d40feab 100644
> --- a/fs/btrfs/locking.c
> +++ b/fs/btrfs/locking.c
> @@ -58,8 +58,8 @@
>  
>  static struct btrfs_lockdep_keyset {
>  	u64			id;		/* root objectid */
> -	/* Longest entry: btrfs-block-group-00 */
> -	char			names[BTRFS_MAX_LEVEL][24];
> +	/* Longest entry: btrfs-raid-stripe-tree-00 */
> +	char			names[BTRFS_MAX_LEVEL][25];

Length of "btrfs-raid-stripe-tree-00" is 25, there should be +1 for the
NUL, also length aligned to at least 4 is better.

>  	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
>  } btrfs_lockdep_keysets[] = {
>  	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
> @@ -74,6 +74,7 @@ static struct btrfs_lockdep_keyset {
>  	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
>  	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
>  	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
> +	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID,DEFINE_NAME("raid-stripe-tree") },

The naming is without the "tree"

>  	{ .id = 0,				DEFINE_NAME("tree")	},
>  };
>  
> diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
> index fc3c32186d7e..3fb758ce3ac0 100644
> --- a/include/uapi/linux/btrfs_tree.h
> +++ b/include/uapi/linux/btrfs_tree.h
> @@ -4,9 +4,8 @@
>  
>  #include <linux/btrfs.h>
>  #include <linux/types.h>
> -#ifdef __KERNEL__
>  #include <linux/stddef.h>
> -#else
> +#ifndef __KERNEL__
>  #include <stddef.h>
>  #endif
>  
> @@ -73,6 +72,9 @@
>  /* Holds the block group items for extent tree v2. */
>  #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
>  
> +/* tracks RAID stripes in block groups. */

	Tracks ...

> +#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
> +
>  /* device stats in the device tree */
>  #define BTRFS_DEV_STATS_OBJECTID 0ULL
>  
> @@ -285,6 +287,8 @@
>   */
>  #define BTRFS_QGROUP_RELATION_KEY       246
>  
> +#define BTRFS_RAID_STRIPE_KEY		247

Any particular reason you chose 247 for the key number? It does not
leave any gap after BTRFS_QGROUP_RELATION_KEY and before
BTRFS_BALANCE_ITEM_KEY. If this is related to extents then please find
more suitable group of keys where to put it.

> +
>  /*
>   * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
>   */
> @@ -719,6 +723,31 @@ struct btrfs_free_space_header {
>  	__le64 num_bitmaps;
>  } __attribute__ ((__packed__));
>  
> +struct btrfs_raid_stride {
> +	/* btrfs device-id this raid extent lives on */

Comments should be full sentences.

> +	__le64 devid;
> +	/* physical location on disk */
> +	__le64 physical;
> +	/* length of stride on this disk */
> +	__le64 length;
> +};

__attribute__ ((__packed__));

> +
> +#define BTRFS_STRIPE_DUP	0
> +#define BTRFS_STRIPE_RAID0	1
> +#define BTRFS_STRIPE_RAID1	2
> +#define BTRFS_STRIPE_RAID1C3	3
> +#define BTRFS_STRIPE_RAID1C4	4
> +#define BTRFS_STRIPE_RAID5	5
> +#define BTRFS_STRIPE_RAID6	6
> +#define BTRFS_STRIPE_RAID10	7

This is probably defining the on-disk format so some consistency is
desired, there are already the BTRFS_BLOCK_GROUP_* types, from which the
BTRFS_RAID_* are derive, so the BTRFS_STRIPE_* values should match the
order and ideally the values themselves if possible.

> +
> +struct btrfs_stripe_extent {
> +	__u8 encoding;
> +	__u8 reserved[7];
> +	/* array of raid strides this stripe is composed of */
> +	__DECLARE_FLEX_ARRAY(struct btrfs_raid_stride, strides);

Do we really whant to declare that as __DECLARE_FLEX_ARRAY? It's not a
standard macro and obscures the definition.


> +};
> +
>  #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
>  #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
>  
> 
> -- 
> 2.41.0
  
Johannes Thumshirn Sept. 13, 2023, 6:02 a.m. UTC | #4
On 12.09.23 22:32, David Sterba wrote:
>> @@ -306,6 +306,16 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
>>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
>>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
>>   
>> +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
> 
> What is encoding referring to?

At the moment (only) the RAID type. But in the future it can be expanded 
to all kinds of encodings, like Reed-Solomon, Butterfly-Codes, etc...

>>   static struct btrfs_lockdep_keyset {
>>   	u64			id;		/* root objectid */
>> -	/* Longest entry: btrfs-block-group-00 */
>> -	char			names[BTRFS_MAX_LEVEL][24];
>> +	/* Longest entry: btrfs-raid-stripe-tree-00 */
>> +	char			names[BTRFS_MAX_LEVEL][25];
> 
> Length of "btrfs-raid-stripe-tree-00" is 25, there should be +1 for the
> NUL, also length aligned to at least 4 is better.
> 

OK.

>>   	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
>>   } btrfs_lockdep_keysets[] = {
>>   	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
>> @@ -74,6 +74,7 @@ static struct btrfs_lockdep_keyset {
>>   	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
>>   	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
>>   	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
>> +	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID,DEFINE_NAME("raid-stripe-tree") },
> 
> The naming is without the "tree"

OK

>> @@ -73,6 +72,9 @@
>>   /* Holds the block group items for extent tree v2. */
>>   #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
>>   
>> +/* tracks RAID stripes in block groups. */
> 
> 	Tracks ...
> 

OK

>> +#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
>> +
>>   /* device stats in the device tree */
>>   #define BTRFS_DEV_STATS_OBJECTID 0ULL
>>   
>> @@ -285,6 +287,8 @@
>>    */
>>   #define BTRFS_QGROUP_RELATION_KEY       246
>>   
>> +#define BTRFS_RAID_STRIPE_KEY		247
> 
> Any particular reason you chose 247 for the key number? It does not
> leave any gap after BTRFS_QGROUP_RELATION_KEY and before
> BTRFS_BALANCE_ITEM_KEY. If this is related to extents then please find
> more suitable group of keys where to put it.

Nope, it was just the last free spot.

> 
>> +
>>   /*
>>    * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
>>    */
>> @@ -719,6 +723,31 @@ struct btrfs_free_space_header {
>>   	__le64 num_bitmaps;
>>   } __attribute__ ((__packed__));
>>   
>> +struct btrfs_raid_stride {
>> +	/* btrfs device-id this raid extent lives on */
> 
> Comments should be full sentences.

OK

> 
>> +	__le64 devid;
>> +	/* physical location on disk */
>> +	__le64 physical;
>> +	/* length of stride on this disk */
>> +	__le64 length;
>> +};
> 
> __attribute__ ((__packed__));

The structure doesn't have any holes in it so packed is not needed.

I might also be misinformed, but doesn't packed potentially lead to bad 
code generation on some platforms?  I've always been under the 
impression that packed forces the compiler to do byte-wise loads and 
stores. But as I've said, I might be misinformed.

> 
>> +
>> +#define BTRFS_STRIPE_DUP	0
>> +#define BTRFS_STRIPE_RAID0	1
>> +#define BTRFS_STRIPE_RAID1	2
>> +#define BTRFS_STRIPE_RAID1C3	3
>> +#define BTRFS_STRIPE_RAID1C4	4
>> +#define BTRFS_STRIPE_RAID5	5
>> +#define BTRFS_STRIPE_RAID6	6
>> +#define BTRFS_STRIPE_RAID10	7
> 
> This is probably defining the on-disk format so some consistency is
> desired, there are already the BTRFS_BLOCK_GROUP_* types, from which the
> BTRFS_RAID_* are derive, so the BTRFS_STRIPE_* values should match the
> order and ideally the values themselves if possible.
> 
>> +
>> +struct btrfs_stripe_extent {
>> +	__u8 encoding;
>> +	__u8 reserved[7];
>> +	/* array of raid strides this stripe is composed of */
>> +	__DECLARE_FLEX_ARRAY(struct btrfs_raid_stride, strides);
> 
> Do we really whant to declare that as __DECLARE_FLEX_ARRAY? It's not a
> standard macro and obscures the definition.
> 

Indeed we do not anymore, as this version does introduce another u64 
before the strides array! I'll gladly get rid of it.
  
David Sterba Sept. 13, 2023, 2:49 p.m. UTC | #5
On Wed, Sep 13, 2023 at 06:02:09AM +0000, Johannes Thumshirn wrote:
> On 12.09.23 22:32, David Sterba wrote:
> >> @@ -306,6 +306,16 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
> >>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
> >>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
> >>   
> >> +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
> > 
> > What is encoding referring to?
> 
> At the moment (only) the RAID type. But in the future it can be expanded 
> to all kinds of encodings, like Reed-Solomon, Butterfly-Codes, etc...

I see, could it be better called ECC? Like stripe_extent_ecc, that would
be clear that it's for the correction, encoding sounds is too generic.

> >> +	__le64 devid;
> >> +	/* physical location on disk */
> >> +	__le64 physical;
> >> +	/* length of stride on this disk */
> >> +	__le64 length;
> >> +};
> > 
> > __attribute__ ((__packed__));
> 
> The structure doesn't have any holes in it so packed is not needed.
> 
> I might also be misinformed, but doesn't packed potentially lead to bad 
> code generation on some platforms?  I've always been under the 
> impression that packed forces the compiler to do byte-wise loads and 
> stores. But as I've said, I might be misinformed.

All on-disk structures have the packed attribute so for consistency and
future safety it should be here too, even if it technically does not
need it due to alignment. In addition, strucutres that need padding
would be also problematic, e.g. u64 followed by u32 needs 4 bytes of
padding but the next item after it would be placed right after u32.

It's right that on some platforms unaligned access is done by more code
but for the same reason on such platforms we can't let the compiler
decide the layout when the structure is directly mapped onto the blocks.
  
Johannes Thumshirn Sept. 13, 2023, 2:57 p.m. UTC | #6
On 13.09.23 16:50, David Sterba wrote:
> On Wed, Sep 13, 2023 at 06:02:09AM +0000, Johannes Thumshirn wrote:
>> On 12.09.23 22:32, David Sterba wrote:
>>>> @@ -306,6 +306,16 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
>>>>    BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
>>>>    BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
>>>>    
>>>> +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
>>>
>>> What is encoding referring to?
>>
>> At the moment (only) the RAID type. But in the future it can be expanded
>> to all kinds of encodings, like Reed-Solomon, Butterfly-Codes, etc...
> 
> I see, could it be better called ECC? Like stripe_extent_ecc, that would
> be clear that it's for the correction, encoding sounds is too generic.

Hmm but for RAID0 there is no correction, so not really as well. I'd 
suggest 'type', but I /think/ for RAID5/6 we'll need type=data and 
type=parity (and future ECC as well).

Maybe level, as in RAID level? I know currently it is redundant, as we 
can derive it from the block-group.
  
David Sterba Sept. 13, 2023, 4:06 p.m. UTC | #7
On Wed, Sep 13, 2023 at 02:57:50PM +0000, Johannes Thumshirn wrote:
> On 13.09.23 16:50, David Sterba wrote:
> > On Wed, Sep 13, 2023 at 06:02:09AM +0000, Johannes Thumshirn wrote:
> >> On 12.09.23 22:32, David Sterba wrote:
> >>>> @@ -306,6 +306,16 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
> >>>>    BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
> >>>>    BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
> >>>>    
> >>>> +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
> >>>
> >>> What is encoding referring to?
> >>
> >> At the moment (only) the RAID type. But in the future it can be expanded
> >> to all kinds of encodings, like Reed-Solomon, Butterfly-Codes, etc...
> > 
> > I see, could it be better called ECC? Like stripe_extent_ecc, that would
> > be clear that it's for the correction, encoding sounds is too generic.
> 
> Hmm but for RAID0 there is no correction, so not really as well. I'd 
> suggest 'type', but I /think/ for RAID5/6 we'll need type=data and 
> type=parity (and future ECC as well).
> 
> Maybe level, as in RAID level? I know currently it is redundant, as we 
> can derive it from the block-group.

Ok, let's keep encoding, we might actually need the genric meaning, what
I was missing was the context.
  

Patch

diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index f958eccff477..977ff160a024 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -306,6 +306,16 @@  BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
 
+BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_FUNCS(raid_stride_length, struct btrfs_raid_stride, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
+			 struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_length, struct btrfs_raid_stride, length, 64);
+
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
 BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6ac4fd8cc8dc..e7760d40feab 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -58,8 +58,8 @@ 
 
 static struct btrfs_lockdep_keyset {
 	u64			id;		/* root objectid */
-	/* Longest entry: btrfs-block-group-00 */
-	char			names[BTRFS_MAX_LEVEL][24];
+	/* Longest entry: btrfs-raid-stripe-tree-00 */
+	char			names[BTRFS_MAX_LEVEL][25];
 	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
 } btrfs_lockdep_keysets[] = {
 	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
@@ -74,6 +74,7 @@  static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
 	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
 	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
+	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID,DEFINE_NAME("raid-stripe-tree") },
 	{ .id = 0,				DEFINE_NAME("tree")	},
 };
 
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index fc3c32186d7e..3fb758ce3ac0 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -4,9 +4,8 @@ 
 
 #include <linux/btrfs.h>
 #include <linux/types.h>
-#ifdef __KERNEL__
 #include <linux/stddef.h>
-#else
+#ifndef __KERNEL__
 #include <stddef.h>
 #endif
 
@@ -73,6 +72,9 @@ 
 /* Holds the block group items for extent tree v2. */
 #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
 
+/* tracks RAID stripes in block groups. */
+#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
+
 /* device stats in the device tree */
 #define BTRFS_DEV_STATS_OBJECTID 0ULL
 
@@ -285,6 +287,8 @@ 
  */
 #define BTRFS_QGROUP_RELATION_KEY       246
 
+#define BTRFS_RAID_STRIPE_KEY		247
+
 /*
  * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
  */
@@ -719,6 +723,31 @@  struct btrfs_free_space_header {
 	__le64 num_bitmaps;
 } __attribute__ ((__packed__));
 
+struct btrfs_raid_stride {
+	/* btrfs device-id this raid extent lives on */
+	__le64 devid;
+	/* physical location on disk */
+	__le64 physical;
+	/* length of stride on this disk */
+	__le64 length;
+};
+
+#define BTRFS_STRIPE_DUP	0
+#define BTRFS_STRIPE_RAID0	1
+#define BTRFS_STRIPE_RAID1	2
+#define BTRFS_STRIPE_RAID1C3	3
+#define BTRFS_STRIPE_RAID1C4	4
+#define BTRFS_STRIPE_RAID5	5
+#define BTRFS_STRIPE_RAID6	6
+#define BTRFS_STRIPE_RAID10	7
+
+struct btrfs_stripe_extent {
+	__u8 encoding;
+	__u8 reserved[7];
+	/* array of raid strides this stripe is composed of */
+	__DECLARE_FLEX_ARRAY(struct btrfs_raid_stride, strides);
+};
+
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)