[v9,01/11] btrfs: add raid stripe tree definitions

Message ID 20230914-raid-stripe-tree-v9-1-15d423829637@wdc.com
State New
Headers
Series btrfs: introduce RAID stripe tree |

Commit Message

Johannes Thumshirn Sept. 14, 2023, 4:06 p.m. UTC
  Add definitions for the raid stripe tree. This tree will hold information
about the on-disk layout of the stripes in a RAID set.

Each stripe extent has a 1:1 relationship with an on-disk extent item and
is doing the logical to per-drive physical address translation for the
extent item in question.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/accessors.h            | 10 ++++++++++
 fs/btrfs/locking.c              |  1 +
 include/uapi/linux/btrfs_tree.h | 31 +++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+)
  

Comments

Qu Wenruo Sept. 15, 2023, 12:22 a.m. UTC | #1
On 2023/9/15 01:36, Johannes Thumshirn wrote:
> Add definitions for the raid stripe tree. This tree will hold information
> about the on-disk layout of the stripes in a RAID set.
> 
> Each stripe extent has a 1:1 relationship with an on-disk extent item and
> is doing the logical to per-drive physical address translation for the
> extent item in question.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/accessors.h            | 10 ++++++++++
>   fs/btrfs/locking.c              |  1 +
>   include/uapi/linux/btrfs_tree.h | 31 +++++++++++++++++++++++++++++++
>   3 files changed, 42 insertions(+)
> 
> diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
> index f958eccff477..977ff160a024 100644
> --- a/fs/btrfs/accessors.h
> +++ b/fs/btrfs/accessors.h
> @@ -306,6 +306,16 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
>   
> +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
> +BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
> +BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
> +BTRFS_SETGET_FUNCS(raid_stride_length, struct btrfs_raid_stride, length, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
> +			 struct btrfs_stripe_extent, encoding, 8);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_length, struct btrfs_raid_stride, length, 64);
> +
>   /* struct btrfs_dev_extent */
>   BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
>   BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
> index 6ac4fd8cc8dc..74d8e2003f58 100644
> --- a/fs/btrfs/locking.c
> +++ b/fs/btrfs/locking.c
> @@ -74,6 +74,7 @@ static struct btrfs_lockdep_keyset {
>   	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
>   	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
>   	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
> +	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
>   	{ .id = 0,				DEFINE_NAME("tree")	},
>   };
>   
> diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
> index fc3c32186d7e..6d9c43416b6e 100644
> --- a/include/uapi/linux/btrfs_tree.h
> +++ b/include/uapi/linux/btrfs_tree.h
> @@ -73,6 +73,9 @@
>   /* Holds the block group items for extent tree v2. */
>   #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
>   
> +/* Tracks RAID stripes in block groups. */
> +#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
> +
>   /* device stats in the device tree */
>   #define BTRFS_DEV_STATS_OBJECTID 0ULL
>   
> @@ -261,6 +264,8 @@
>   #define BTRFS_DEV_ITEM_KEY	216
>   #define BTRFS_CHUNK_ITEM_KEY	228
>   
> +#define BTRFS_RAID_STRIPE_KEY	230
> +
>   /*
>    * Records the overall state of the qgroups.
>    * There's only one instance of this key present,
> @@ -719,6 +724,32 @@ struct btrfs_free_space_header {
>   	__le64 num_bitmaps;
>   } __attribute__ ((__packed__));
>   
> +struct btrfs_raid_stride {
> +	/* The btrfs device-id this raid extent lives on */
> +	__le64 devid;
> +	/* The physical location on disk */
> +	__le64 physical;
> +	/* The length of stride on this disk */
> +	__le64 length;
> +} __attribute__ ((__packed__));
> +
> +/* The stripe_extent::encoding, 1:1 mapping of enum btrfs_raid_types */
> +#define BTRFS_STRIPE_RAID0	1
> +#define BTRFS_STRIPE_RAID1	2
> +#define BTRFS_STRIPE_DUP	3
> +#define BTRFS_STRIPE_RAID10	4
> +#define BTRFS_STRIPE_RAID5	5
> +#define BTRFS_STRIPE_RAID6	6
> +#define BTRFS_STRIPE_RAID1C3	7
> +#define BTRFS_STRIPE_RAID1C4	8
> +
> +struct btrfs_stripe_extent {
> +	__u8 encoding;

Considerng the encoding for now is 1:1 map of btrfs_raid_types, and 
normally we use variable like @raid_index for such usage, have 
considered rename it to "raid_index" or "profile_index" instead?

Another thing is, you may want to add extra tree-checker code to verify 
the btrfs_stripe_extent members.

For encoding, it should be all be the known numbers, and item size for 
alignment.

The same for physical/length alignment checks.

Thanks,
Qu
> +	__u8 reserved[7];
> +	/* An array of raid strides this stripe is composed of */
> +	struct btrfs_raid_stride strides[];
> +} __attribute__ ((__packed__));
> +
>   #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
>   #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
>   
>
  
Qu Wenruo Sept. 15, 2023, 12:26 a.m. UTC | #2
On 2023/9/15 09:52, Qu Wenruo wrote:
> 
> 
> On 2023/9/15 01:36, Johannes Thumshirn wrote:
>> Add definitions for the raid stripe tree. This tree will hold information
>> about the on-disk layout of the stripes in a RAID set.
>>
>> Each stripe extent has a 1:1 relationship with an on-disk extent item and
>> is doing the logical to per-drive physical address translation for the
>> extent item in question.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>>   fs/btrfs/accessors.h            | 10 ++++++++++
>>   fs/btrfs/locking.c              |  1 +
>>   include/uapi/linux/btrfs_tree.h | 31 +++++++++++++++++++++++++++++++
>>   3 files changed, 42 insertions(+)
>>
>> diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
>> index f958eccff477..977ff160a024 100644
>> --- a/fs/btrfs/accessors.h
>> +++ b/fs/btrfs/accessors.h
>> @@ -306,6 +306,16 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct 
>> btrfs_timespec, nsec, 32);
>>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, 
>> sec, 64);
>>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, 
>> nsec, 32);
>> +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct 
>> btrfs_stripe_extent, encoding, 8);
>> +BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, 
>> devid, 64);
>> +BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, 
>> physical, 64);
>> +BTRFS_SETGET_FUNCS(raid_stride_length, struct btrfs_raid_stride, 
>> length, 64);
>> +BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
>> +             struct btrfs_stripe_extent, encoding, 8);
>> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct 
>> btrfs_raid_stride, devid, 64);
>> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct 
>> btrfs_raid_stride, physical, 64);
>> +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_length, struct 
>> btrfs_raid_stride, length, 64);
>> +
>>   /* struct btrfs_dev_extent */
>>   BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, 
>> chunk_tree, 64);
>>   BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
>> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
>> index 6ac4fd8cc8dc..74d8e2003f58 100644
>> --- a/fs/btrfs/locking.c
>> +++ b/fs/btrfs/locking.c
>> @@ -74,6 +74,7 @@ static struct btrfs_lockdep_keyset {
>>       { .id = BTRFS_UUID_TREE_OBJECTID,    DEFINE_NAME("uuid")    },
>>       { .id = BTRFS_FREE_SPACE_TREE_OBJECTID,    
>> DEFINE_NAME("free-space") },
>>       { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, 
>> DEFINE_NAME("block-group") },
>> +    { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, 
>> DEFINE_NAME("raid-stripe") },
>>       { .id = 0,                DEFINE_NAME("tree")    },
>>   };
>> diff --git a/include/uapi/linux/btrfs_tree.h 
>> b/include/uapi/linux/btrfs_tree.h
>> index fc3c32186d7e..6d9c43416b6e 100644
>> --- a/include/uapi/linux/btrfs_tree.h
>> +++ b/include/uapi/linux/btrfs_tree.h
>> @@ -73,6 +73,9 @@
>>   /* Holds the block group items for extent tree v2. */
>>   #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
>> +/* Tracks RAID stripes in block groups. */
>> +#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
>> +
>>   /* device stats in the device tree */
>>   #define BTRFS_DEV_STATS_OBJECTID 0ULL
>> @@ -261,6 +264,8 @@
>>   #define BTRFS_DEV_ITEM_KEY    216
>>   #define BTRFS_CHUNK_ITEM_KEY    228
>> +#define BTRFS_RAID_STRIPE_KEY    230
>> +
>>   /*
>>    * Records the overall state of the qgroups.
>>    * There's only one instance of this key present,
>> @@ -719,6 +724,32 @@ struct btrfs_free_space_header {
>>       __le64 num_bitmaps;
>>   } __attribute__ ((__packed__));
>> +struct btrfs_raid_stride {
>> +    /* The btrfs device-id this raid extent lives on */
>> +    __le64 devid;
>> +    /* The physical location on disk */
>> +    __le64 physical;
>> +    /* The length of stride on this disk */
>> +    __le64 length;

Forgot to mention, for btrfs_stripe_extent structure, its key is 
(PHYSICAL, RAID_STRIPE_KEY, LENGTH) right?

So is the length in the btrfs_raid_stride duplicated and we can save 8 
bytes?

Thanks,
Qu
>> +} __attribute__ ((__packed__));
>> +
>> +/* The stripe_extent::encoding, 1:1 mapping of enum btrfs_raid_types */
>> +#define BTRFS_STRIPE_RAID0    1
>> +#define BTRFS_STRIPE_RAID1    2
>> +#define BTRFS_STRIPE_DUP    3
>> +#define BTRFS_STRIPE_RAID10    4
>> +#define BTRFS_STRIPE_RAID5    5
>> +#define BTRFS_STRIPE_RAID6    6
>> +#define BTRFS_STRIPE_RAID1C3    7
>> +#define BTRFS_STRIPE_RAID1C4    8
>> +
>> +struct btrfs_stripe_extent {
>> +    __u8 encoding;
> 
> Considerng the encoding for now is 1:1 map of btrfs_raid_types, and 
> normally we use variable like @raid_index for such usage, have 
> considered rename it to "raid_index" or "profile_index" instead?
> 
> Another thing is, you may want to add extra tree-checker code to verify 
> the btrfs_stripe_extent members.
> 
> For encoding, it should be all be the known numbers, and item size for 
> alignment.
> 
> The same for physical/length alignment checks.
> 
> Thanks,
> Qu
>> +    __u8 reserved[7];
>> +    /* An array of raid strides this stripe is composed of */
>> +    struct btrfs_raid_stride strides[];
>> +} __attribute__ ((__packed__));
>> +
>>   #define BTRFS_HEADER_FLAG_WRITTEN    (1ULL << 0)
>>   #define BTRFS_HEADER_FLAG_RELOC        (1ULL << 1)
>>
  
Johannes Thumshirn Sept. 15, 2023, 9:55 a.m. UTC | #3
On 15.09.23 02:27, Qu Wenruo wrote:
>>>    /*
>>>     * Records the overall state of the qgroups.
>>>     * There's only one instance of this key present,
>>> @@ -719,6 +724,32 @@ struct btrfs_free_space_header {
>>>        __le64 num_bitmaps;
>>>    } __attribute__ ((__packed__));
>>> +struct btrfs_raid_stride {
>>> +    /* The btrfs device-id this raid extent lives on */
>>> +    __le64 devid;
>>> +    /* The physical location on disk */
>>> +    __le64 physical;
>>> +    /* The length of stride on this disk */
>>> +    __le64 length;
> 
> Forgot to mention, for btrfs_stripe_extent structure, its key is
> (PHYSICAL, RAID_STRIPE_KEY, LENGTH) right?
> 
> So is the length in the btrfs_raid_stride duplicated and we can save 8
> bytes?

Nope. The length in the key is the stripe length. The length in the 
stride is the stride length.

Here's an example for why this is needed:

wrote 32768/32768 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 131072/131072 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 8192/8192 bytes at offset 65536
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

[snip]

         item 0 key (XXXXXX RAID_STRIPE_KEY 32768) itemoff XXXXX itemsize 32
                         encoding: RAID0
                         stripe 0 devid 1 physical XXXXXXXXX length 32768
         item 1 key (XXXXXX RAID_STRIPE_KEY 131072) itemoff XXXXX 
itemsize 80
                         encoding: RAID0
                         stripe 0 devid 1 physical XXXXXXXXX length 32768
                         stripe 1 devid 2 physical XXXXXXXXX length 65536
                         stripe 2 devid 1 physical XXXXXXXXX length 32768
         item 2 key (XXXXXX RAID_STRIPE_KEY 8192) itemoff XXXXX itemsize 32
                         encoding: RAID0
                         stripe 0 devid 1 physical XXXXXXXXX length 8192

Without the length in the stride, we don't know when to select the next 
stride in item 1 above.
  
Qu Wenruo Sept. 15, 2023, 10:33 a.m. UTC | #4
On 2023/9/15 19:25, Johannes Thumshirn wrote:
> On 15.09.23 02:27, Qu Wenruo wrote:
>>>>     /*
>>>>      * Records the overall state of the qgroups.
>>>>      * There's only one instance of this key present,
>>>> @@ -719,6 +724,32 @@ struct btrfs_free_space_header {
>>>>         __le64 num_bitmaps;
>>>>     } __attribute__ ((__packed__));
>>>> +struct btrfs_raid_stride {
>>>> +    /* The btrfs device-id this raid extent lives on */
>>>> +    __le64 devid;
>>>> +    /* The physical location on disk */
>>>> +    __le64 physical;
>>>> +    /* The length of stride on this disk */
>>>> +    __le64 length;
>>
>> Forgot to mention, for btrfs_stripe_extent structure, its key is
>> (PHYSICAL, RAID_STRIPE_KEY, LENGTH) right?
>>
>> So is the length in the btrfs_raid_stride duplicated and we can save 8
>> bytes?
>
> Nope. The length in the key is the stripe length. The length in the
> stride is the stride length.
>
> Here's an example for why this is needed:
>
> wrote 32768/32768 bytes at offset 0
> XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> wrote 131072/131072 bytes at offset 0
> XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> wrote 8192/8192 bytes at offset 65536
> XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>
> [snip]
>
>           item 0 key (XXXXXX RAID_STRIPE_KEY 32768) itemoff XXXXX itemsize 32
>                           encoding: RAID0
>                           stripe 0 devid 1 physical XXXXXXXXX length 32768
>           item 1 key (XXXXXX RAID_STRIPE_KEY 131072) itemoff XXXXX
> itemsize 80

Maybe you want to put the whole RAID_STRIPE_KEY definition into the headers.

In fact my initial assumption of such case would be something like this:

            item 0 key (X+0 RAID_STRIPE 32K)
		stripe 0 devid 1 physical XXXXX len 32K
	   item 1 key (X+32K RAID_STRIPE 32K)
		stripe 0 devid 1 physical XXXXX + 32K len 32K
	   item 2 key (X+64K RAID_STRIPE 64K)
		stripe 0 devid 2 physical YYYYY len 64K
	   item 3 key (X+128K RAID_STRIPE 32K)
		stripe 0 devid 1 physical XXXXX + 64K len 32K
            ...

AKA, each RAID_STRIPE_KEY would only contain a continous physical stripe.
And in above case, item 0 and item 1 can be easily merged, also length
can be removed.

And this explains why the lookup code is more complex than I initially
thought.

BTW, would the above layout make the code a little easier?
Or is there any special reason for the existing one layout?

Thank,
Qu


>                           encoding: RAID0
>                           stripe 0 devid 1 physical XXXXXXXXX length 32768
>                           stripe 1 devid 2 physical XXXXXXXXX length 65536
>                           stripe 2 devid 1 physical XXXXXXXXX length 32768

This current layout has another problem.
For RAID10 the interpretation of the RAID_STRIPE item can be very complex.
While

>           item 2 key (XXXXXX RAID_STRIPE_KEY 8192) itemoff XXXXX itemsize 32
>                           encoding: RAID0
>                           stripe 0 devid 1 physical XXXXXXXXX length 8192
>
> Without the length in the stride, we don't know when to select the next
> stride in item 1 above.
  
Johannes Thumshirn Sept. 15, 2023, 10:46 a.m. UTC | #5
On 15.09.23 12:34, Qu Wenruo wrote:
>> [snip]
>>
>>            item 0 key (XXXXXX RAID_STRIPE_KEY 32768) itemoff XXXXX itemsize 32
>>                            encoding: RAID0
>>                            stripe 0 devid 1 physical XXXXXXXXX length 32768
>>            item 1 key (XXXXXX RAID_STRIPE_KEY 131072) itemoff XXXXX
>> itemsize 80
> Maybe you want to put the whole RAID_STRIPE_KEY definition into the headers.
> 
> In fact my initial assumption of such case would be something like this:
> 
>              item 0 key (X+0 RAID_STRIPE 32K)
> 		stripe 0 devid 1 physical XXXXX len 32K
> 	   item 1 key (X+32K RAID_STRIPE 32K)
> 		stripe 0 devid 1 physical XXXXX + 32K len 32K
> 	   item 2 key (X+64K RAID_STRIPE 64K)
> 		stripe 0 devid 2 physical YYYYY len 64K
> 	   item 3 key (X+128K RAID_STRIPE 32K)
> 		stripe 0 devid 1 physical XXXXX + 64K len 32K
>              ...
> 
> AKA, each RAID_STRIPE_KEY would only contain a continous physical stripe.
> And in above case, item 0 and item 1 can be easily merged, also length
> can be removed.
> 
> And this explains why the lookup code is more complex than I initially
> thought.
> 
> BTW, would the above layout make the code a little easier?
> Or is there any special reason for the existing one layout?

It would definitely make the code easier to the cost of more items. But 
of cause smaller items, as we can get rid of the stride length.

Let me think about it.
  
Johannes Thumshirn Oct. 2, 2023, 9:32 a.m. UTC | #6
On 15.09.23 12:34, Qu Wenruo wrote:
> 
> 
> 
> On 2023/9/15 19:25, Johannes Thumshirn wrote:
>> On 15.09.23 02:27, Qu Wenruo wrote:
>>>>>      /*
>>>>>       * Records the overall state of the qgroups.
>>>>>       * There's only one instance of this key present,
>>>>> @@ -719,6 +724,32 @@ struct btrfs_free_space_header {
>>>>>          __le64 num_bitmaps;
>>>>>      } __attribute__ ((__packed__));
>>>>> +struct btrfs_raid_stride {
>>>>> +    /* The btrfs device-id this raid extent lives on */
>>>>> +    __le64 devid;
>>>>> +    /* The physical location on disk */
>>>>> +    __le64 physical;
>>>>> +    /* The length of stride on this disk */
>>>>> +    __le64 length;
>>>
>>> Forgot to mention, for btrfs_stripe_extent structure, its key is
>>> (PHYSICAL, RAID_STRIPE_KEY, LENGTH) right?
>>>
>>> So is the length in the btrfs_raid_stride duplicated and we can save 8
>>> bytes?
>>
>> Nope. The length in the key is the stripe length. The length in the
>> stride is the stride length.
>>
>> Here's an example for why this is needed:
>>
>> wrote 32768/32768 bytes at offset 0
>> XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>> wrote 131072/131072 bytes at offset 0
>> XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>> wrote 8192/8192 bytes at offset 65536
>> XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>>
>> [snip]
>>
>>            item 0 key (XXXXXX RAID_STRIPE_KEY 32768) itemoff XXXXX itemsize 32
>>                            encoding: RAID0
>>                            stripe 0 devid 1 physical XXXXXXXXX length 32768
>>            item 1 key (XXXXXX RAID_STRIPE_KEY 131072) itemoff XXXXX
>> itemsize 80
> 
> Maybe you want to put the whole RAID_STRIPE_KEY definition into the headers.
> 
> In fact my initial assumption of such case would be something like this:
> 
>              item 0 key (X+0 RAID_STRIPE 32K)
> 		stripe 0 devid 1 physical XXXXX len 32K
> 	   item 1 key (X+32K RAID_STRIPE 32K)
> 		stripe 0 devid 1 physical XXXXX + 32K len 32K
> 	   item 2 key (X+64K RAID_STRIPE 64K)
> 		stripe 0 devid 2 physical YYYYY len 64K
> 	   item 3 key (X+128K RAID_STRIPE 32K)
> 		stripe 0 devid 1 physical XXXXX + 64K len 32K
>              ...
> 
> AKA, each RAID_STRIPE_KEY would only contain a continous physical stripe.
> And in above case, item 0 and item 1 can be easily merged, also length
> can be removed.
> 
> And this explains why the lookup code is more complex than I initially
> thought.
> 
> BTW, would the above layout make the code a little easier?
> Or is there any special reason for the existing one layout?
> 
> Thank,
> Qu
> 
> 
>>                            encoding: RAID0
>>                            stripe 0 devid 1 physical XXXXXXXXX length 32768
>>                            stripe 1 devid 2 physical XXXXXXXXX length 65536
>>                            stripe 2 devid 1 physical XXXXXXXXX length 32768
> 
> This current layout has another problem.
> For RAID10 the interpretation of the RAID_STRIPE item can be very complex.
> While
> 
>>            item 2 key (XXXXXX RAID_STRIPE_KEY 8192) itemoff XXXXX itemsize 32
>>                            encoding: RAID0
>>                            stripe 0 devid 1 physical XXXXXXXXX length 8192
>>
>> Without the length in the stride, we don't know when to select the next
>> stride in item 1 above.
> 


JFYI preliminary tests for your suggestion look reasonably good. I'll 
give it some more testing and code cleanup but it actually seems 
sensible to do.
  

Patch

diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index f958eccff477..977ff160a024 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -306,6 +306,16 @@  BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
 
+BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_FUNCS(raid_stride_length, struct btrfs_raid_stride, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
+			 struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_length, struct btrfs_raid_stride, length, 64);
+
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
 BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6ac4fd8cc8dc..74d8e2003f58 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -74,6 +74,7 @@  static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
 	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
 	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
+	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
 	{ .id = 0,				DEFINE_NAME("tree")	},
 };
 
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index fc3c32186d7e..6d9c43416b6e 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -73,6 +73,9 @@ 
 /* Holds the block group items for extent tree v2. */
 #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
 
+/* Tracks RAID stripes in block groups. */
+#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
+
 /* device stats in the device tree */
 #define BTRFS_DEV_STATS_OBJECTID 0ULL
 
@@ -261,6 +264,8 @@ 
 #define BTRFS_DEV_ITEM_KEY	216
 #define BTRFS_CHUNK_ITEM_KEY	228
 
+#define BTRFS_RAID_STRIPE_KEY	230
+
 /*
  * Records the overall state of the qgroups.
  * There's only one instance of this key present,
@@ -719,6 +724,32 @@  struct btrfs_free_space_header {
 	__le64 num_bitmaps;
 } __attribute__ ((__packed__));
 
+struct btrfs_raid_stride {
+	/* The btrfs device-id this raid extent lives on */
+	__le64 devid;
+	/* The physical location on disk */
+	__le64 physical;
+	/* The length of stride on this disk */
+	__le64 length;
+} __attribute__ ((__packed__));
+
+/* The stripe_extent::encoding, 1:1 mapping of enum btrfs_raid_types */
+#define BTRFS_STRIPE_RAID0	1
+#define BTRFS_STRIPE_RAID1	2
+#define BTRFS_STRIPE_DUP	3
+#define BTRFS_STRIPE_RAID10	4
+#define BTRFS_STRIPE_RAID5	5
+#define BTRFS_STRIPE_RAID6	6
+#define BTRFS_STRIPE_RAID1C3	7
+#define BTRFS_STRIPE_RAID1C4	8
+
+struct btrfs_stripe_extent {
+	__u8 encoding;
+	__u8 reserved[7];
+	/* An array of raid strides this stripe is composed of */
+	struct btrfs_raid_stride strides[];
+} __attribute__ ((__packed__));
+
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)