[01/21] bcachefs: KEY_TYPE_accounting

Message ID 20240225023826.2413565-2-kent.overstreet@linux.dev
State New
Headers
Series bcachefs disk accounting rewrite |

Commit Message

Kent Overstreet Feb. 25, 2024, 2:38 a.m. UTC
  New key type for the disk space accounting rewrite.

 - Holds a variable sized array of u64s (may be more than one for
   accounting e.g. compressed and uncompressed size, or buckets and
   sectors for a given data type)

 - Updates are deltas, not new versions of the key: this means updates
   to accounting can happen via the btree write buffer, which we'll be
   teaching to accumulate deltas.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/Makefile                 |   3 +-
 fs/bcachefs/bcachefs.h               |   1 +
 fs/bcachefs/bcachefs_format.h        |  80 +++------------
 fs/bcachefs/bkey_methods.c           |   1 +
 fs/bcachefs/disk_accounting.c        |  70 ++++++++++++++
 fs/bcachefs/disk_accounting.h        |  52 ++++++++++
 fs/bcachefs/disk_accounting_format.h | 139 +++++++++++++++++++++++++++
 fs/bcachefs/replicas_format.h        |  21 ++++
 fs/bcachefs/sb-downgrade.c           |  12 ++-
 fs/bcachefs/sb-errors_types.h        |   3 +-
 10 files changed, 311 insertions(+), 71 deletions(-)
 create mode 100644 fs/bcachefs/disk_accounting.c
 create mode 100644 fs/bcachefs/disk_accounting.h
 create mode 100644 fs/bcachefs/disk_accounting_format.h
 create mode 100644 fs/bcachefs/replicas_format.h
  

Comments

Brian Foster Feb. 27, 2024, 3:49 p.m. UTC | #1
On Sat, Feb 24, 2024 at 09:38:03PM -0500, Kent Overstreet wrote:
> New key type for the disk space accounting rewrite.
> 
>  - Holds a variable sized array of u64s (may be more than one for
>    accounting e.g. compressed and uncompressed size, or buckets and
>    sectors for a given data type)
> 
>  - Updates are deltas, not new versions of the key: this means updates
>    to accounting can happen via the btree write buffer, which we'll be
>    teaching to accumulate deltas.
> 
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
>  fs/bcachefs/Makefile                 |   3 +-
>  fs/bcachefs/bcachefs.h               |   1 +
>  fs/bcachefs/bcachefs_format.h        |  80 +++------------
>  fs/bcachefs/bkey_methods.c           |   1 +
>  fs/bcachefs/disk_accounting.c        |  70 ++++++++++++++
>  fs/bcachefs/disk_accounting.h        |  52 ++++++++++
>  fs/bcachefs/disk_accounting_format.h | 139 +++++++++++++++++++++++++++
>  fs/bcachefs/replicas_format.h        |  21 ++++
>  fs/bcachefs/sb-downgrade.c           |  12 ++-
>  fs/bcachefs/sb-errors_types.h        |   3 +-
>  10 files changed, 311 insertions(+), 71 deletions(-)
>  create mode 100644 fs/bcachefs/disk_accounting.c
>  create mode 100644 fs/bcachefs/disk_accounting.h
>  create mode 100644 fs/bcachefs/disk_accounting_format.h
>  create mode 100644 fs/bcachefs/replicas_format.h
> 
..
> diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
> new file mode 100644
> index 000000000000..e06a42f0d578
> --- /dev/null
> +++ b/fs/bcachefs/disk_accounting_format.h
> @@ -0,0 +1,139 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
> +#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
> +
> +#include "replicas_format.h"
> +
> +/*
> + * Disk accounting - KEY_TYPE_accounting - on disk format:
> + *
> + * Here, the key has considerably more structure than a typical key (bpos); an
> + * accounting key is 'struct disk_accounting_key', which is a union of bpos.
> + *

First impression.. I'm a little confused why the key type is a union of
bpos. I'm possibly missing something fundamental/obvious, but could you
elaborate more on why that is here?

Brian

> + * This is a type-tagged union of all our various subtypes; a disk accounting
> + * key can be device counters, replicas counters, et cetera - it's extensible.
> + *
> + * The value is a list of u64s or s64s; the number of counters is specific to a
> + * given accounting type.
> + *
> + * Unlike with other key types, updates are _deltas_, and the deltas are not
> + * resolved until the update to the underlying btree, done by btree write buffer
> + * flush or journal replay.
> + *
> + * Journal replay in particular requires special handling. The journal tracks a
> + * range of entries which may possibly have not yet been applied to the btree
> + * yet - it does not know definitively whether individual entries are dirty and
> + * still need to be applied.
> + *
> + * To handle this, we use the version field of struct bkey, and give every
> + * accounting update a unique version number - a total ordering in time; the
> + * version number is derived from the key's position in the journal. Then
> + * journal replay can compare the version number of the key from the journal
> + * with the version number of the key in the btree to determine if a key needs
> + * to be replayed.
> + *
> + * For this to work, we must maintain this strict time ordering of updates as
> + * they are flushed to the btree, both via write buffer flush and via journal
> + * replay. This has complications for the write buffer code while journal replay
> + * is still in progress; the write buffer cannot flush any accounting keys to
> + * the btree until journal replay has finished replaying its accounting keys, or
> + * the (newer) version number of the keys from the write buffer will cause
> + * updates from journal replay to be lost.
> + */
> +
> +struct bch_accounting {
> +	struct bch_val		v;
> +	__u64			d[];
> +};
> +
> +#define BCH_ACCOUNTING_MAX_COUNTERS		3
> +
> +#define BCH_DATA_TYPES()		\
> +	x(free,		0)		\
> +	x(sb,		1)		\
> +	x(journal,	2)		\
> +	x(btree,	3)		\
> +	x(user,		4)		\
> +	x(cached,	5)		\
> +	x(parity,	6)		\
> +	x(stripe,	7)		\
> +	x(need_gc_gens,	8)		\
> +	x(need_discard,	9)
> +
> +enum bch_data_type {
> +#define x(t, n) BCH_DATA_##t,
> +	BCH_DATA_TYPES()
> +#undef x
> +	BCH_DATA_NR
> +};
> +
> +static inline bool data_type_is_empty(enum bch_data_type type)
> +{
> +	switch (type) {
> +	case BCH_DATA_free:
> +	case BCH_DATA_need_gc_gens:
> +	case BCH_DATA_need_discard:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
> +static inline bool data_type_is_hidden(enum bch_data_type type)
> +{
> +	switch (type) {
> +	case BCH_DATA_sb:
> +	case BCH_DATA_journal:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
> +#define BCH_DISK_ACCOUNTING_TYPES()		\
> +	x(nr_inodes,		0)		\
> +	x(persistent_reserved,	1)		\
> +	x(replicas,		2)		\
> +	x(dev_data_type,	3)		\
> +	x(dev_stripe_buckets,	4)
> +
> +enum disk_accounting_type {
> +#define x(f, nr)	BCH_DISK_ACCOUNTING_##f	= nr,
> +	BCH_DISK_ACCOUNTING_TYPES()
> +#undef x
> +	BCH_DISK_ACCOUNTING_TYPE_NR,
> +};
> +
> +struct bch_nr_inodes {
> +};
> +
> +struct bch_persistent_reserved {
> +	__u8			nr_replicas;
> +};
> +
> +struct bch_dev_data_type {
> +	__u8			dev;
> +	__u8			data_type;
> +};
> +
> +struct bch_dev_stripe_buckets {
> +	__u8			dev;
> +};
> +
> +struct disk_accounting_key {
> +	union {
> +	struct {
> +		__u8				type;
> +		union {
> +		struct bch_nr_inodes		nr_inodes;
> +		struct bch_persistent_reserved	persistent_reserved;
> +		struct bch_replicas_entry_v1	replicas;
> +		struct bch_dev_data_type	dev_data_type;
> +		struct bch_dev_stripe_buckets	dev_stripe_buckets;
> +		};
> +	};
> +		struct bpos			_pad;
> +	};
> +};
> +
> +#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
> diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
> new file mode 100644
> index 000000000000..ed94f8c636b3
> --- /dev/null
> +++ b/fs/bcachefs/replicas_format.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _BCACHEFS_REPLICAS_FORMAT_H
> +#define _BCACHEFS_REPLICAS_FORMAT_H
> +
> +struct bch_replicas_entry_v0 {
> +	__u8			data_type;
> +	__u8			nr_devs;
> +	__u8			devs[];
> +} __packed;
> +
> +struct bch_replicas_entry_v1 {
> +	__u8			data_type;
> +	__u8			nr_devs;
> +	__u8			nr_required;
> +	__u8			devs[];
> +} __packed;
> +
> +#define replicas_entry_bytes(_i)					\
> +	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
> +
> +#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
> diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
> index 3337419faeff..33db8d7ca8c4 100644
> --- a/fs/bcachefs/sb-downgrade.c
> +++ b/fs/bcachefs/sb-downgrade.c
> @@ -52,9 +52,15 @@
>  	  BCH_FSCK_ERR_subvol_fs_path_parent_wrong)		\
>  	x(btree_subvolume_children,				\
>  	  BIT_ULL(BCH_RECOVERY_PASS_check_subvols),		\
> -	  BCH_FSCK_ERR_subvol_children_not_set)
> +	  BCH_FSCK_ERR_subvol_children_not_set)			\
> +	x(disk_accounting_v2,					\
> +	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
> +	  BCH_FSCK_ERR_accounting_mismatch)
>  
> -#define DOWNGRADE_TABLE()
> +#define DOWNGRADE_TABLE()					\
> +	x(disk_accounting_v2,					\
> +	  BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info),		\
> +	  BCH_FSCK_ERR_dev_usage_buckets_wrong)
>  
>  struct upgrade_downgrade_entry {
>  	u64		recovery_passes;
> @@ -108,7 +114,7 @@ void bch2_sb_set_upgrade(struct bch_fs *c,
>  		}
>  }
>  
> -#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ };
> +#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
>  DOWNGRADE_TABLE()
>  #undef x
>  
> diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
> index 0df4b0e7071a..383e13711001 100644
> --- a/fs/bcachefs/sb-errors_types.h
> +++ b/fs/bcachefs/sb-errors_types.h
> @@ -264,7 +264,8 @@
>  	x(subvol_children_not_set,				256)	\
>  	x(subvol_children_bad,					257)	\
>  	x(subvol_loop,						258)	\
> -	x(subvol_unreachable,					259)
> +	x(subvol_unreachable,					259)	\
> +	x(accounting_mismatch,					260)
>  
>  enum bch_sb_error_id {
>  #define x(t, n) BCH_FSCK_ERR_##t = n,
> -- 
> 2.43.0
>
  
Kent Overstreet Feb. 28, 2024, 7:39 p.m. UTC | #2
On Tue, Feb 27, 2024 at 10:49:19AM -0500, Brian Foster wrote:
> On Sat, Feb 24, 2024 at 09:38:03PM -0500, Kent Overstreet wrote:
> > New key type for the disk space accounting rewrite.
> > 
> >  - Holds a variable sized array of u64s (may be more than one for
> >    accounting e.g. compressed and uncompressed size, or buckets and
> >    sectors for a given data type)
> > 
> >  - Updates are deltas, not new versions of the key: this means updates
> >    to accounting can happen via the btree write buffer, which we'll be
> >    teaching to accumulate deltas.
> > 
> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> > ---
> >  fs/bcachefs/Makefile                 |   3 +-
> >  fs/bcachefs/bcachefs.h               |   1 +
> >  fs/bcachefs/bcachefs_format.h        |  80 +++------------
> >  fs/bcachefs/bkey_methods.c           |   1 +
> >  fs/bcachefs/disk_accounting.c        |  70 ++++++++++++++
> >  fs/bcachefs/disk_accounting.h        |  52 ++++++++++
> >  fs/bcachefs/disk_accounting_format.h | 139 +++++++++++++++++++++++++++
> >  fs/bcachefs/replicas_format.h        |  21 ++++
> >  fs/bcachefs/sb-downgrade.c           |  12 ++-
> >  fs/bcachefs/sb-errors_types.h        |   3 +-
> >  10 files changed, 311 insertions(+), 71 deletions(-)
> >  create mode 100644 fs/bcachefs/disk_accounting.c
> >  create mode 100644 fs/bcachefs/disk_accounting.h
> >  create mode 100644 fs/bcachefs/disk_accounting_format.h
> >  create mode 100644 fs/bcachefs/replicas_format.h
> > 
> ...
> > diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
> > new file mode 100644
> > index 000000000000..e06a42f0d578
> > --- /dev/null
> > +++ b/fs/bcachefs/disk_accounting_format.h
> > @@ -0,0 +1,139 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
> > +#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
> > +
> > +#include "replicas_format.h"
> > +
> > +/*
> > + * Disk accounting - KEY_TYPE_accounting - on disk format:
> > + *
> > + * Here, the key has considerably more structure than a typical key (bpos); an
> > + * accounting key is 'struct disk_accounting_key', which is a union of bpos.
> > + *
> 
> First impression.. I'm a little confused why the key type is a union of
> bpos. I'm possibly missing something fundamental/obvious, but could you
> elaborate more on why that is here?

How's this?

 * More specifically: a key is just a muliword integer (where word endianness   
 * matches native byte order), so we're treating bpos as an opaque 20 byte                                                                                       
 * integer and mapping bch_accounting_key to that.
  
Brian Foster Feb. 29, 2024, 6:43 p.m. UTC | #3
On Wed, Feb 28, 2024 at 02:39:38PM -0500, Kent Overstreet wrote:
> On Tue, Feb 27, 2024 at 10:49:19AM -0500, Brian Foster wrote:
> > On Sat, Feb 24, 2024 at 09:38:03PM -0500, Kent Overstreet wrote:
> > > New key type for the disk space accounting rewrite.
> > > 
> > >  - Holds a variable sized array of u64s (may be more than one for
> > >    accounting e.g. compressed and uncompressed size, or buckets and
> > >    sectors for a given data type)
> > > 
> > >  - Updates are deltas, not new versions of the key: this means updates
> > >    to accounting can happen via the btree write buffer, which we'll be
> > >    teaching to accumulate deltas.
> > > 
> > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> > > ---
> > >  fs/bcachefs/Makefile                 |   3 +-
> > >  fs/bcachefs/bcachefs.h               |   1 +
> > >  fs/bcachefs/bcachefs_format.h        |  80 +++------------
> > >  fs/bcachefs/bkey_methods.c           |   1 +
> > >  fs/bcachefs/disk_accounting.c        |  70 ++++++++++++++
> > >  fs/bcachefs/disk_accounting.h        |  52 ++++++++++
> > >  fs/bcachefs/disk_accounting_format.h | 139 +++++++++++++++++++++++++++
> > >  fs/bcachefs/replicas_format.h        |  21 ++++
> > >  fs/bcachefs/sb-downgrade.c           |  12 ++-
> > >  fs/bcachefs/sb-errors_types.h        |   3 +-
> > >  10 files changed, 311 insertions(+), 71 deletions(-)
> > >  create mode 100644 fs/bcachefs/disk_accounting.c
> > >  create mode 100644 fs/bcachefs/disk_accounting.h
> > >  create mode 100644 fs/bcachefs/disk_accounting_format.h
> > >  create mode 100644 fs/bcachefs/replicas_format.h
> > > 
> > ...
> > > diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
> > > new file mode 100644
> > > index 000000000000..e06a42f0d578
> > > --- /dev/null
> > > +++ b/fs/bcachefs/disk_accounting_format.h
> > > @@ -0,0 +1,139 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
> > > +#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
> > > +
> > > +#include "replicas_format.h"
> > > +
> > > +/*
> > > + * Disk accounting - KEY_TYPE_accounting - on disk format:
> > > + *
> > > + * Here, the key has considerably more structure than a typical key (bpos); an
> > > + * accounting key is 'struct disk_accounting_key', which is a union of bpos.
> > > + *
> > 
> > First impression.. I'm a little confused why the key type is a union of
> > bpos. I'm possibly missing something fundamental/obvious, but could you
> > elaborate more on why that is here?
> 
> How's this?
> 
>  * More specifically: a key is just a muliword integer (where word endianness   
>  * matches native byte order), so we're treating bpos as an opaque 20 byte                                                                                  
>  * integer and mapping bch_accounting_key to that.
> 

Hmm.. I think the connection I missed on first look is basically
disk_accounting_key_to_bpos(). I think what is confusing is that calling
this a key makes me think of bkey, which I understand to contain a bpos,
so then overlaying it with a bpos didn't really make a lot of sense to
me conceptually.

So when I look at disk_accounting_key_to_bpos(), I see we are actually
using the bpos _pad field, and this structure basically _is_ the bpos
for a disk accounting btree bkey. So that kind of makes me wonder why
this isn't called something like disk_accounting_pos instead of _key,
but maybe that is wrong for other reasons.

Either way, what I'm trying to get at is that I think this documentation
would be better if it explained conceptually how disk_accounting_key
relates to bkey/bpos, and why it exists separately from bkey vs. other
key types, rather than (or at least before) getting into the lower level
side effects of a union with bpos.

Brian
  
Kent Overstreet Feb. 29, 2024, 9:24 p.m. UTC | #4
On Thu, Feb 29, 2024 at 01:43:15PM -0500, Brian Foster wrote:
> Hmm.. I think the connection I missed on first look is basically
> disk_accounting_key_to_bpos(). I think what is confusing is that calling
> this a key makes me think of bkey, which I understand to contain a bpos,
> so then overlaying it with a bpos didn't really make a lot of sense to
> me conceptually.
>
> So when I look at disk_accounting_key_to_bpos(), I see we are actually
> using the bpos _pad field, and this structure basically _is_ the bpos
> for a disk accounting btree bkey. So that kind of makes me wonder why
> this isn't called something like disk_accounting_pos instead of _key,
> but maybe that is wrong for other reasons.

hmm, I didn't consider calling it disk_accounting_pos. I'll let that
roll around in my brain.

'key' is more standard terminology to me outside bcachefs, but 'pos'
does make more sense within bcachefs.

> Either way, what I'm trying to get at is that I think this documentation
> would be better if it explained conceptually how disk_accounting_key
> relates to bkey/bpos, and why it exists separately from bkey vs. other
> key types, rather than (or at least before) getting into the lower level
> side effects of a union with bpos.

Well, that gets into some fun territory - ideally bpos would not be a
fixed thing that every btree was forced to use, we'd be able to define
different types per btree.

And we're actually going to need to be able to do that in order to do
configurationless autotiering - i.e. tracking how hot/cold data is on an
inode:offset basis, because LRU btree backreferences need to go in the
key (bpos), not the value, in order to avoid collisions, and bpos isn't
big enough for that.

disk_accounting_(key|pos) is an even trickier situation, because of
endianness issues. The trick we do with bpos of defining the field order
differently based on endianness so that byte order matches word order -
that really wouldn't work here, so there is at present no practical way
that I know of to avoid the byte swabbing when going back and forth
between bpos and disk_accounting_pos on big endian.

But gcc does have an attribute now that lets you specify that an integer
struct member is big or little endian... I if we could get them to go
one step further and give us an attribute to control whether members are
laid out in ascending or descending order...
  
Brian Foster March 1, 2024, 3:03 p.m. UTC | #5
On Thu, Feb 29, 2024 at 04:24:37PM -0500, Kent Overstreet wrote:
> On Thu, Feb 29, 2024 at 01:43:15PM -0500, Brian Foster wrote:
> > Hmm.. I think the connection I missed on first look is basically
> > disk_accounting_key_to_bpos(). I think what is confusing is that calling
> > this a key makes me think of bkey, which I understand to contain a bpos,
> > so then overlaying it with a bpos didn't really make a lot of sense to
> > me conceptually.
> >
> > So when I look at disk_accounting_key_to_bpos(), I see we are actually
> > using the bpos _pad field, and this structure basically _is_ the bpos
> > for a disk accounting btree bkey. So that kind of makes me wonder why
> > this isn't called something like disk_accounting_pos instead of _key,
> > but maybe that is wrong for other reasons.
> 
> hmm, I didn't consider calling it disk_accounting_pos. I'll let that
> roll around in my brain.
> 
> 'key' is more standard terminology to me outside bcachefs, but 'pos'
> does make more sense within bcachefs.
> 

Ok, so I'm not totally crazy at least. :)

Note again that wasn't an explicit suggestion, just that it seems more
logical to me based on my current understanding. I'm just trying to put
down my initial thoughts/confusions in hopes that at least some of this
triggers ideas for improvements...

> > Either way, what I'm trying to get at is that I think this documentation
> > would be better if it explained conceptually how disk_accounting_key
> > relates to bkey/bpos, and why it exists separately from bkey vs. other
> > key types, rather than (or at least before) getting into the lower level
> > side effects of a union with bpos.
> 
> Well, that gets into some fun territory - ideally bpos would not be a
> fixed thing that every btree was forced to use, we'd be able to define
> different types per btree.
> 

Ok, but this starts to sound orthogonal to the accounting bits. Since I
don't really grok why this is called a key, here's how I would add to
the existing documentation:

"Here, the key has considerably more structure than a typical key
(bpos); an accounting key is 'struct disk_accounting_key', which is a
union of bpos. We do this because disk_account_key actually is bpos for
the related bkey that ends up in the accounting btree.

This btree uses nontraditional bpos semantics because accounting btree
keys are indexed differently <reasons based on the counter
structures..?>. Yadda yadda..

Unlike with other key types, <continued existing comment> ...
"

Hm?

Brian

> And we're actually going to need to be able to do that in order to do
> configurationless autotiering - i.e. tracking how hot/cold data is on an
> inode:offset basis, because LRU btree backreferences need to go in the
> key (bpos), not the value, in order to avoid collisions, and bpos isn't
> big enough for that.
> 
> disk_accounting_(key|pos) is an even trickier situation, because of
> endianness issues. The trick we do with bpos of defining the field order
> differently based on endianness so that byte order matches word order -
> that really wouldn't work here, so there is at present no practical way
> that I know of to avoid the byte swabbing when going back and forth
> between bpos and disk_accounting_pos on big endian.
> 
> But gcc does have an attribute now that lets you specify that an integer
> struct member is big or little endian... I if we could get them to go
> one step further and give us an attribute to control whether members are
> laid out in ascending or descending order...
>
  
Kent Overstreet March 1, 2024, 7:30 p.m. UTC | #6
On Fri, Mar 01, 2024 at 10:03:06AM -0500, Brian Foster wrote:
> On Thu, Feb 29, 2024 at 04:24:37PM -0500, Kent Overstreet wrote:
> > On Thu, Feb 29, 2024 at 01:43:15PM -0500, Brian Foster wrote:
> > > Hmm.. I think the connection I missed on first look is basically
> > > disk_accounting_key_to_bpos(). I think what is confusing is that calling
> > > this a key makes me think of bkey, which I understand to contain a bpos,
> > > so then overlaying it with a bpos didn't really make a lot of sense to
> > > me conceptually.
> > >
> > > So when I look at disk_accounting_key_to_bpos(), I see we are actually
> > > using the bpos _pad field, and this structure basically _is_ the bpos
> > > for a disk accounting btree bkey. So that kind of makes me wonder why
> > > this isn't called something like disk_accounting_pos instead of _key,
> > > but maybe that is wrong for other reasons.
> > 
> > hmm, I didn't consider calling it disk_accounting_pos. I'll let that
> > roll around in my brain.
> > 
> > 'key' is more standard terminology to me outside bcachefs, but 'pos'
> > does make more sense within bcachefs.
> > 
> 
> Ok, so I'm not totally crazy at least. :)
> 
> Note again that wasn't an explicit suggestion, just that it seems more
> logical to me based on my current understanding. I'm just trying to put
> down my initial thoughts/confusions in hopes that at least some of this
> triggers ideas for improvements...

I liked it because it makes the relationship between disk_accounting_pos
and bpos more explicit - they're both the same kind of thing.

> > > Either way, what I'm trying to get at is that I think this documentation
> > > would be better if it explained conceptually how disk_accounting_key
> > > relates to bkey/bpos, and why it exists separately from bkey vs. other
> > > key types, rather than (or at least before) getting into the lower level
> > > side effects of a union with bpos.
> > 
> > Well, that gets into some fun territory - ideally bpos would not be a
> > fixed thing that every btree was forced to use, we'd be able to define
> > different types per btree.
> > 
> 
> Ok, but this starts to sound orthogonal to the accounting bits. Since I
> don't really grok why this is called a key, here's how I would add to
> the existing documentation:
> 
> "Here, the key has considerably more structure than a typical key
> (bpos); an accounting key is 'struct disk_accounting_key', which is a
> union of bpos. We do this because disk_account_key actually is bpos for
> the related bkey that ends up in the accounting btree.
> 
> This btree uses nontraditional bpos semantics because accounting btree
> keys are indexed differently <reasons based on the counter
> structures..?>. Yadda yadda..
> 
> Unlike with other key types, <continued existing comment> ...

I'm just going to go with my latest revision for now, I think it's a
reasonable balance between terse and explanatory:

 * More specifically: a key is just a muliword integer (where word endianness
 * matches native byte order), so we're treating bpos as an opaque 20 byte
 * integer and mapping bch_accounting_pos to that.
  

Patch

diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index f42f6d256945..94b2edb4155f 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -27,10 +27,11 @@  bcachefs-y		:=	\
 	checksum.o		\
 	clock.o			\
 	compress.o		\
+	data_update.o		\
 	debug.o			\
 	dirent.o		\
+	disk_accounting.o	\
 	disk_groups.o		\
-	data_update.o		\
 	ec.o			\
 	errcode.o		\
 	error.o			\
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 0bee9dab6068..62812fc1cad0 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -509,6 +509,7 @@  enum gc_phase {
 	GC_PHASE_BTREE_logged_ops,
 	GC_PHASE_BTREE_rebalance_work,
 	GC_PHASE_BTREE_subvolume_children,
+	GC_PHASE_BTREE_accounting,
 
 	GC_PHASE_PENDING_DELETE,
 };
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index bff8750ac0d7..313ca7dc370d 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -416,7 +416,8 @@  static inline void bkey_init(struct bkey *k)
 	x(bucket_gens,		30)			\
 	x(snapshot_tree,	31)			\
 	x(logged_op_truncate,	32)			\
-	x(logged_op_finsert,	33)
+	x(logged_op_finsert,	33)			\
+	x(accounting,		34)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name	= nr,
@@ -501,17 +502,19 @@  struct bch_sb_field {
 	x(downgrade,			14)
 
 #include "alloc_background_format.h"
+#include "dirent_format.h"
+#include "disk_accounting_format.h"
 #include "extents_format.h"
-#include "reflink_format.h"
 #include "ec_format.h"
 #include "inode_format.h"
-#include "dirent_format.h"
-#include "xattr_format.h"
-#include "quota_format.h"
 #include "logged_ops_format.h"
+#include "quota_format.h"
+#include "reflink_format.h"
+#include "replicas_format.h"
+#include "sb-counters_format.h"
 #include "snapshot_format.h"
 #include "subvolume_format.h"
-#include "sb-counters_format.h"
+#include "xattr_format.h"
 
 enum bch_sb_field_type {
 #define x(f, nr)	BCH_SB_FIELD_##f = nr,
@@ -680,69 +683,11 @@  LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);
 
 /* BCH_SB_FIELD_replicas: */
 
-#define BCH_DATA_TYPES()		\
-	x(free,		0)		\
-	x(sb,		1)		\
-	x(journal,	2)		\
-	x(btree,	3)		\
-	x(user,		4)		\
-	x(cached,	5)		\
-	x(parity,	6)		\
-	x(stripe,	7)		\
-	x(need_gc_gens,	8)		\
-	x(need_discard,	9)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
-	BCH_DATA_TYPES()
-#undef x
-	BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
-	switch (type) {
-	case BCH_DATA_free:
-	case BCH_DATA_need_gc_gens:
-	case BCH_DATA_need_discard:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
-	switch (type) {
-	case BCH_DATA_sb:
-	case BCH_DATA_journal:
-		return true;
-	default:
-		return false;
-	}
-}
-
-struct bch_replicas_entry_v0 {
-	__u8			data_type;
-	__u8			nr_devs;
-	__u8			devs[];
-} __packed;
-
 struct bch_sb_field_replicas_v0 {
 	struct bch_sb_field	field;
 	struct bch_replicas_entry_v0 entries[];
 } __packed __aligned(8);
 
-struct bch_replicas_entry_v1 {
-	__u8			data_type;
-	__u8			nr_devs;
-	__u8			nr_required;
-	__u8			devs[];
-} __packed;
-
-#define replicas_entry_bytes(_i)					\
-	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
 struct bch_sb_field_replicas {
 	struct bch_sb_field	field;
 	struct bch_replicas_entry_v1 entries[];
@@ -875,7 +820,8 @@  struct bch_sb_field_downgrade {
 	x(rebalance_work,		BCH_VERSION(1,  3))		\
 	x(member_seq,			BCH_VERSION(1,  4))		\
 	x(subvolume_fs_parent,		BCH_VERSION(1,  5))		\
-	x(btree_subvolume_children,	BCH_VERSION(1,  6))
+	x(btree_subvolume_children,	BCH_VERSION(1,  6))		\
+	x(disk_accounting_v2,		BCH_VERSION(1,  7))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -1525,7 +1471,9 @@  enum btree_id_flags {
 	x(rebalance_work,	18,	BTREE_ID_SNAPSHOT_FIELD,		\
 	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))			\
 	x(subvolume_children,	19,	0,					\
-	  BIT_ULL(KEY_TYPE_set))
+	  BIT_ULL(KEY_TYPE_set))						\
+	x(accounting,		20,	BTREE_ID_SNAPSHOT_FIELD,		\
+	  BIT_ULL(KEY_TYPE_accounting))						\
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5e52684764eb..da25bdd1e8a6 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -7,6 +7,7 @@ 
 #include "btree_types.h"
 #include "alloc_background.h"
 #include "dirent.h"
+#include "disk_accounting.h"
 #include "ec.h"
 #include "error.h"
 #include "extents.h"
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
new file mode 100644
index 000000000000..209f59e87b34
--- /dev/null
+++ b/fs/bcachefs/disk_accounting.c
@@ -0,0 +1,70 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "disk_accounting.h"
+#include "replicas.h"
+
+static const char * const disk_accounting_type_strs[] = {
+#define x(t, n, ...) [n] = #t,
+	BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+	NULL
+};
+
+int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k,
+			    enum bkey_invalid_flags flags,
+			    struct printbuf *err)
+{
+	return 0;
+}
+
+void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_key *k)
+{
+	if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
+		prt_printf(out, "unknown type %u", k->type);
+		return;
+	}
+
+	prt_str(out, disk_accounting_type_strs[k->type]);
+	prt_str(out, " ");
+
+	switch (k->type) {
+	case BCH_DISK_ACCOUNTING_nr_inodes:
+		break;
+	case BCH_DISK_ACCOUNTING_persistent_reserved:
+		prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
+		break;
+	case BCH_DISK_ACCOUNTING_replicas:
+		bch2_replicas_entry_to_text(out, &k->replicas);
+		break;
+	case BCH_DISK_ACCOUNTING_dev_data_type:
+		prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
+		bch2_prt_data_type(out, k->dev_data_type.data_type);
+		break;
+	case BCH_DISK_ACCOUNTING_dev_stripe_buckets:
+		prt_printf(out, "dev=%u", k->dev_stripe_buckets.dev);
+		break;
+	}
+}
+
+void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
+	struct disk_accounting_key acc_k;
+	bpos_to_disk_accounting_key(&acc_k, k.k->p);
+
+	bch2_accounting_key_to_text(out, &acc_k);
+
+	for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
+		prt_printf(out, " %lli", acc.v->d[i]);
+}
+
+void bch2_accounting_swab(struct bkey_s k)
+{
+	for (u64 *p = (u64 *) k.v;
+	     p < (u64 *) bkey_val_end(k);
+	     p++)
+		*p = swab64(*p);
+}
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
new file mode 100644
index 000000000000..e15299665859
--- /dev/null
+++ b/fs/bcachefs/disk_accounting.h
@@ -0,0 +1,52 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_H
+#define _BCACHEFS_DISK_ACCOUNTING_H
+
+static inline unsigned bch2_accounting_counters(const struct bkey *k)
+{
+	return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
+}
+
+static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
+					      struct bkey_s_c_accounting src)
+{
+	EBUG_ON(dst->k.u64s != src.k->u64s);
+
+	for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
+		dst->v.d[i] += src.v->d[i];
+	if (bversion_cmp(dst->k.version, src.k->version) < 0)
+		dst->k.version = src.k->version;
+}
+
+static inline void bpos_to_disk_accounting_key(struct disk_accounting_key *acc, struct bpos p)
+{
+	acc->_pad = p;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	bch2_bpos_swab(&acc->_pad);
+#endif
+}
+
+static inline struct bpos disk_accounting_key_to_bpos(struct disk_accounting_key *k)
+{
+	struct bpos ret = k->_pad;
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	bch2_bpos_swab(&ret);
+#endif
+	return ret;
+}
+
+int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c,
+			    enum bkey_invalid_flags, struct printbuf *);
+void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_key *);
+void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_accounting_swab(struct bkey_s);
+
+#define bch2_bkey_ops_accounting ((struct bkey_ops) {	\
+	.key_invalid	= bch2_accounting_invalid,	\
+	.val_to_text	= bch2_accounting_to_text,	\
+	.swab		= bch2_accounting_swab,		\
+	.min_val_size	= 8,				\
+})
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
new file mode 100644
index 000000000000..e06a42f0d578
--- /dev/null
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -0,0 +1,139 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
+#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
+
+#include "replicas_format.h"
+
+/*
+ * Disk accounting - KEY_TYPE_accounting - on disk format:
+ *
+ * Here, the key has considerably more structure than a typical key (bpos); an
+ * accounting key is 'struct disk_accounting_key', which is a union of bpos.
+ *
+ * This is a type-tagged union of all our various subtypes; a disk accounting
+ * key can be device counters, replicas counters, et cetera - it's extensible.
+ *
+ * The value is a list of u64s or s64s; the number of counters is specific to a
+ * given accounting type.
+ *
+ * Unlike with other key types, updates are _deltas_, and the deltas are not
+ * resolved until the update to the underlying btree, done by btree write buffer
+ * flush or journal replay.
+ *
+ * Journal replay in particular requires special handling. The journal tracks a
+ * range of entries which may possibly have not yet been applied to the btree
+ * yet - it does not know definitively whether individual entries are dirty and
+ * still need to be applied.
+ *
+ * To handle this, we use the version field of struct bkey, and give every
+ * accounting update a unique version number - a total ordering in time; the
+ * version number is derived from the key's position in the journal. Then
+ * journal replay can compare the version number of the key from the journal
+ * with the version number of the key in the btree to determine if a key needs
+ * to be replayed.
+ *
+ * For this to work, we must maintain this strict time ordering of updates as
+ * they are flushed to the btree, both via write buffer flush and via journal
+ * replay. This has complications for the write buffer code while journal replay
+ * is still in progress; the write buffer cannot flush any accounting keys to
+ * the btree until journal replay has finished replaying its accounting keys, or
+ * the (newer) version number of the keys from the write buffer will cause
+ * updates from journal replay to be lost.
+ */
+
+struct bch_accounting {
+	struct bch_val		v;
+	__u64			d[];
+};
+
+#define BCH_ACCOUNTING_MAX_COUNTERS		3
+
+#define BCH_DATA_TYPES()		\
+	x(free,		0)		\
+	x(sb,		1)		\
+	x(journal,	2)		\
+	x(btree,	3)		\
+	x(user,		4)		\
+	x(cached,	5)		\
+	x(parity,	6)		\
+	x(stripe,	7)		\
+	x(need_gc_gens,	8)		\
+	x(need_discard,	9)
+
+enum bch_data_type {
+#define x(t, n) BCH_DATA_##t,
+	BCH_DATA_TYPES()
+#undef x
+	BCH_DATA_NR
+};
+
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+	switch (type) {
+	case BCH_DATA_free:
+	case BCH_DATA_need_gc_gens:
+	case BCH_DATA_need_discard:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+	switch (type) {
+	case BCH_DATA_sb:
+	case BCH_DATA_journal:
+		return true;
+	default:
+		return false;
+	}
+}
+
+#define BCH_DISK_ACCOUNTING_TYPES()		\
+	x(nr_inodes,		0)		\
+	x(persistent_reserved,	1)		\
+	x(replicas,		2)		\
+	x(dev_data_type,	3)		\
+	x(dev_stripe_buckets,	4)
+
+enum disk_accounting_type {
+#define x(f, nr)	BCH_DISK_ACCOUNTING_##f	= nr,
+	BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+	BCH_DISK_ACCOUNTING_TYPE_NR,
+};
+
+struct bch_nr_inodes {
+};
+
+struct bch_persistent_reserved {
+	__u8			nr_replicas;
+};
+
+struct bch_dev_data_type {
+	__u8			dev;
+	__u8			data_type;
+};
+
+struct bch_dev_stripe_buckets {
+	__u8			dev;
+};
+
+struct disk_accounting_key {
+	union {
+	struct {
+		__u8				type;
+		union {
+		struct bch_nr_inodes		nr_inodes;
+		struct bch_persistent_reserved	persistent_reserved;
+		struct bch_replicas_entry_v1	replicas;
+		struct bch_dev_data_type	dev_data_type;
+		struct bch_dev_stripe_buckets	dev_stripe_buckets;
+		};
+	};
+		struct bpos			_pad;
+	};
+};
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
new file mode 100644
index 000000000000..ed94f8c636b3
--- /dev/null
+++ b/fs/bcachefs/replicas_format.h
@@ -0,0 +1,21 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_FORMAT_H
+#define _BCACHEFS_REPLICAS_FORMAT_H
+
+struct bch_replicas_entry_v0 {
+	__u8			data_type;
+	__u8			nr_devs;
+	__u8			devs[];
+} __packed;
+
+struct bch_replicas_entry_v1 {
+	__u8			data_type;
+	__u8			nr_devs;
+	__u8			nr_required;
+	__u8			devs[];
+} __packed;
+
+#define replicas_entry_bytes(_i)					\
+	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+
+#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 3337419faeff..33db8d7ca8c4 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -52,9 +52,15 @@ 
 	  BCH_FSCK_ERR_subvol_fs_path_parent_wrong)		\
 	x(btree_subvolume_children,				\
 	  BIT_ULL(BCH_RECOVERY_PASS_check_subvols),		\
-	  BCH_FSCK_ERR_subvol_children_not_set)
+	  BCH_FSCK_ERR_subvol_children_not_set)			\
+	x(disk_accounting_v2,					\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
+	  BCH_FSCK_ERR_accounting_mismatch)
 
-#define DOWNGRADE_TABLE()
+#define DOWNGRADE_TABLE()					\
+	x(disk_accounting_v2,					\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info),		\
+	  BCH_FSCK_ERR_dev_usage_buckets_wrong)
 
 struct upgrade_downgrade_entry {
 	u64		recovery_passes;
@@ -108,7 +114,7 @@  void bch2_sb_set_upgrade(struct bch_fs *c,
 		}
 }
 
-#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ };
+#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
 DOWNGRADE_TABLE()
 #undef x
 
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index 0df4b0e7071a..383e13711001 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -264,7 +264,8 @@ 
 	x(subvol_children_not_set,				256)	\
 	x(subvol_children_bad,					257)	\
 	x(subvol_loop,						258)	\
-	x(subvol_unreachable,					259)
+	x(subvol_unreachable,					259)	\
+	x(accounting_mismatch,					260)
 
 enum bch_sb_error_id {
 #define x(t, n) BCH_FSCK_ERR_##t = n,