[17/17] check hash table insertions
Commit Message
On Dec 27, 2022, Alexandre Oliva <oliva@adacore.com> wrote:
> The number of bugs it revealed tells me that leaving callers in charge
> of completing insertions is too error prone. I found this
> verification code a bit too expensive to enable in general.
Here's a relatively cheap, checking-only test to catch dangling inserts.
I've noticed a number of potential problems in hash tables, of three
kinds: insertion of entries that seem empty, dangling insertions, and
lookups during insertions.
These problems may all have the effect of replacing a deleted entry
with one that seems empty, which may disconnect double-hashing chains
involving that entry, and thus cause entries to go missing.
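To make that failure mode concrete, here is a minimal, self-contained sketch of a double-hashing lookup (illustration only, not the hash-table.h code; all names are made up). Probing stops at the first EMPTY slot but continues past DELETED ones, so a slot that reverts from DELETED to EMPTY cuts off every entry further down its probe chain.

#include <cstddef>

enum slot_state { EMPTY, DELETED, OCCUPIED };

struct slot
{
  slot_state state;
  int key;
};

/* Return the slot holding KEY, or NULL if it is not present.  Assumes
   HASH2 is a valid probe step and that the table always keeps at least
   one EMPTY slot, as the real implementation arranges.  */
slot *
lookup (slot *table, size_t size, int key, size_t hash1, size_t hash2)
{
  size_t index = hash1 % size;
  for (;;)
    {
      slot *s = &table[index];
      if (s->state == EMPTY)
        return NULL;    /* The probe chain ends here.  */
      if (s->state == OCCUPIED && s->key == key)
        return s;
      /* DELETED and non-matching slots keep the probe going.  */
      index = (index + hash2) % size;
    }
}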
This patch detects such problems by recording a pending insertion and
checking that it's completed before other potentially-conflicting
operations. The additional field is only introduced when checking is
enabled.
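Put differently, the contract the check enforces is the usual caller-side idiom: after requesting INSERT, store a non-empty value into the returned slot before doing anything else with the table. Here is a rough sketch of that pattern; the entry type, hasher and value handling are hypothetical and assume the usual GCC includes, only the find_slot_with_hash/INSERT usage itself is taken from hash-table.h.

/* Hypothetical entry type and hasher, for illustration only.  */
struct entry { const char *name; int value; };

struct entry_hasher : nofree_ptr_hash <entry>
{
  typedef const char *compare_type;
  static hashval_t hash (const entry *e) { return htab_hash_string (e->name); }
  static bool equal (const entry *e, const char *n) { return !strcmp (e->name, n); }
};

static void
record_value (hash_table<entry_hasher> *table, const char *name, int value)
{
  hashval_t h = htab_hash_string (name);
  entry **slot = table->find_slot_with_hash (name, h, INSERT);
  if (!*slot)
    {
      /* Complete the insertion right away: storing a non-empty value
         into *SLOT is what completes it.  Leaving *SLOT empty, or
         performing another lookup, insertion or removal first, is the
         sort of dangling insert the new check reports.  */
      entry *e = XNEW (entry);
      e->name = name;
      e->value = value;
      *slot = e;
    }
  else
    (*slot)->value = value;
}

Anything that still leaves the returned slot empty by the time the table is used again now trips the check.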
Regstrapped on x86_64-linux-gnu. Ok to install?
for gcc/ChangeLog
* hash-table.h (check_complete_insertion, check_insert_slot):
New hash_table methods.
(m_inserting_slot): New hash_table field.
(begin, hash_table ctors, ~hash_table): Check previous insert.
(expand, empty_slow, clear_slot, find_with_hash): Likewise.
(remove_elt_with_hash, traverse_noresize): Likewise.
(gt_pch_nx): Likewise.
(find_slot_with_hash): Likewise. Record requested insert.
---
gcc/hash-table.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 60 insertions(+), 3 deletions(-)
Comments
> On Dec 28, 2022, at 13:51, Alexandre Oliva via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> On Dec 27, 2022, Alexandre Oliva <oliva@adacore.com> wrote:
>
>> The number of bugs it revealed tells me that leaving callers in charge
>> of completing insertions is too error prone. I found this
>> verification code a bit too expensive to enable in general.
>
> Here's a relatively cheap, checking-only test to catch dangling inserts.
>
>
> I've noticed a number of potential problems in hash tables, of three
> kinds: insertion of entries that seem empty, dangling insertions, and
> lookups during insertions.
>
> These problems may all have the effect of replacing a deleted entry
> with one that seems empty, which may disconnect double-hashing chains
> involving that entry, and thus cause entries to go missing.
>
> This patch detects such problems by recording a pending insertion and
> checking that it's completed before other potentially-conflicting
> operations. The additional field is only introduced when checking is
> enabled.
I wonder whether, on INSERT, pushing a DELETED marker would fix the dangling-insert and search-during-insert problems, and whether that would be better from a design point of view? (It of course requires a DELETED representation.)
> Regstrapped on x86_64-linux-gnu. Ok to install?
>
>
> for gcc/ChangeLog
>
> * hash-table.h (check_complete_insertion, check_insert_slot):
> New hash_table methods.
> (m_inserting_slot): New hash_table field.
> (begin, hash_table ctors, ~hash_table): Check previous insert.
> (expand, empty_slow, clear_slot, find_with_hash): Likewise.
> (remove_elt_with_hash, traverse_noresize): Likewise.
> (gt_pch_nx): Likewise.
> (find_slot_with_hash): Likewise. Record requested insert.
> ---
> gcc/hash-table.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 60 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/hash-table.h b/gcc/hash-table.h
> index f4bda6102323e..33753d04b7bdb 100644
> --- a/gcc/hash-table.h
> +++ b/gcc/hash-table.h
> @@ -495,6 +495,7 @@ public:
> {
> if (Lazy && m_entries == NULL)
> return iterator ();
> + check_complete_insertion ();
> iterator iter (m_entries, m_entries + m_size);
> iter.slide ();
> return iter;
> @@ -551,8 +552,39 @@ private:
> Descriptor::mark_empty (v);
> }
>
> +public:
> + void check_complete_insertion () const
> + {
> +#if CHECKING_P
> + if (!m_inserting_slot)
> + return;
> +
> + gcc_checking_assert (m_inserting_slot >= &m_entries[0]
> + && m_inserting_slot < &m_entries[m_size]);
> +
> + if (!is_empty (*m_inserting_slot))
> + m_inserting_slot = NULL;
> + else
> + gcc_unreachable ();
> +#endif
> + }
> +
> +private:
> + value_type *check_insert_slot (value_type *ret)
> + {
> +#if CHECKING_P
> + gcc_checking_assert (is_empty (*ret));
> + m_inserting_slot = ret;
> +#endif
> + return ret;
> + }
> +
> +#if CHECKING_P
> + mutable value_type *m_inserting_slot;
> +#endif
> +
> /* Table itself. */
> - typename Descriptor::value_type *m_entries;
> + value_type *m_entries;
>
> size_t m_size;
>
> @@ -607,6 +639,9 @@ hash_table<Descriptor, Lazy, Allocator>::hash_table (size_t size, bool ggc,
> ATTRIBUTE_UNUSED,
> mem_alloc_origin origin
> MEM_STAT_DECL) :
> +#if CHECKING_P
> + m_inserting_slot (0),
> +#endif
> m_n_elements (0), m_n_deleted (0), m_searches (0), m_collisions (0),
> m_ggc (ggc), m_sanitize_eq_and_hash (sanitize_eq_and_hash)
> #if GATHER_STATISTICS
> @@ -639,6 +674,9 @@ hash_table<Descriptor, Lazy, Allocator>::hash_table (const hash_table &h,
> ATTRIBUTE_UNUSED,
> mem_alloc_origin origin
> MEM_STAT_DECL) :
> +#if CHECKING_P
> + m_inserting_slot (0),
> +#endif
> m_n_elements (h.m_n_elements), m_n_deleted (h.m_n_deleted),
> m_searches (0), m_collisions (0), m_ggc (ggc),
> m_sanitize_eq_and_hash (sanitize_eq_and_hash)
> @@ -646,6 +684,8 @@ hash_table<Descriptor, Lazy, Allocator>::hash_table (const hash_table &h,
> , m_gather_mem_stats (gather_mem_stats)
> #endif
> {
> + h.check_complete_insertion ();
> +
> size_t size = h.m_size;
>
> if (m_gather_mem_stats)
> @@ -675,6 +715,8 @@ template<typename Descriptor, bool Lazy,
> template<typename Type> class Allocator>
> hash_table<Descriptor, Lazy, Allocator>::~hash_table ()
> {
> + check_complete_insertion ();
> +
> if (!Lazy || m_entries)
> {
> for (size_t i = m_size - 1; i < m_size; i--)
> @@ -778,6 +820,8 @@ template<typename Descriptor, bool Lazy,
> void
> hash_table<Descriptor, Lazy, Allocator>::expand ()
> {
> + check_complete_insertion ();
> +
> value_type *oentries = m_entries;
> unsigned int oindex = m_size_prime_index;
> size_t osize = size ();
> @@ -853,6 +897,8 @@ template<typename Descriptor, bool Lazy,
> void
> hash_table<Descriptor, Lazy, Allocator>::empty_slow ()
> {
> + check_complete_insertion ();
> +
> size_t size = m_size;
> size_t nsize = size;
> value_type *entries = m_entries;
> @@ -901,6 +947,8 @@ template<typename Descriptor, bool Lazy,
> void
> hash_table<Descriptor, Lazy, Allocator>::clear_slot (value_type *slot)
> {
> + check_complete_insertion ();
> +
> gcc_checking_assert (!(slot < m_entries || slot >= m_entries + size ()
> || is_empty (*slot) || is_deleted (*slot)));
>
> @@ -927,6 +975,8 @@ hash_table<Descriptor, Lazy, Allocator>
> if (Lazy && m_entries == NULL)
> m_entries = alloc_entries (size);
>
> + check_complete_insertion ();
> +
> #if CHECKING_P
> if (m_sanitize_eq_and_hash)
> verify (comparable, hash);
> @@ -976,6 +1026,8 @@ hash_table<Descriptor, Lazy, Allocator>
> }
> if (insert == INSERT && m_size * 3 <= m_n_elements * 4)
> expand ();
> + else
> + check_complete_insertion ();
>
> #if CHECKING_P
> if (m_sanitize_eq_and_hash)
> @@ -1022,11 +1074,11 @@ hash_table<Descriptor, Lazy, Allocator>
> {
> m_n_deleted--;
> mark_empty (*first_deleted_slot);
> - return first_deleted_slot;
> + return check_insert_slot (first_deleted_slot);
> }
>
> m_n_elements++;
> - return &m_entries[index];
> + return check_insert_slot (&m_entries[index]);
> }
>
> /* Verify that all existing elements in the hash table which are
> @@ -1068,6 +1120,8 @@ void
> hash_table<Descriptor, Lazy, Allocator>
> ::remove_elt_with_hash (const compare_type &comparable, hashval_t hash)
> {
> + check_complete_insertion ();
> +
> value_type *slot = find_slot_with_hash (comparable, hash, NO_INSERT);
> if (slot == NULL)
> return;
> @@ -1094,6 +1148,8 @@ hash_table<Descriptor, Lazy, Allocator>::traverse_noresize (Argument argument)
> if (Lazy && m_entries == NULL)
> return;
>
> + check_complete_insertion ();
> +
> value_type *slot = m_entries;
> value_type *limit = slot + size ();
>
> @@ -1210,6 +1266,7 @@ template<typename D>
> static void
> gt_pch_nx (hash_table<D> *h)
> {
> + h->check_complete_insertion ();
> bool success
> = gt_pch_note_object (h->m_entries, h, hashtab_entry_note_pointers<D>);
> gcc_checking_assert (success);
>
>
> --
> Alexandre Oliva, happy hacker https://FSFLA.org/blogs/lxo/
> Free Software Activist GNU Toolchain Engineer
> Disinformation flourishes because many people care deeply about injustice
> but very few check the facts. Ask me about <https://stallmansupport.org>
On Dec 28, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
> I wonder whether, on INSERT, pushing a DELETED marker would fix the
> dangling-insert and search-during-insert problems, and whether that
> would be better from a design point of view? (It of course requires
> a DELETED representation.)
I'm undecided whether a design that rules out the possibility of not
having DELETED would qualify as unequivocally better.
Unless DELETED takes over the NULL representation, and something else is
used for EMPTY, every INSERT point would have to be adjusted to look for
the non-NULL DELETED representation. That alone was enough for me to
rule out going down that path.
If we were to change the INSERT callers, it would make sense to
e.g. return a smart pointer that enforced the completion of the
insertion. But that wouldn't fix lookups during insertion without
requiring a separate DELETED representation.
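For illustration, such a wrapper might look roughly like the sketch below; it is hypothetical, assumes an empty slot is represented by a null pointer, and is not part of this patch.

#include <cassert>
#include <cstddef>

/* RAII wrapper around a slot obtained with INSERT: the caller must
   store a value through it before it goes out of scope, otherwise the
   destructor reports the dangling insert.  Copying is ignored here to
   keep the sketch short.  */
template <typename T>
class inserting_slot
{
  T **m_slot;

public:
  explicit inserting_slot (T **slot) : m_slot (slot) {}

  /* Completing the insertion clears the pending state.  */
  void set (T *value)
  {
    *m_slot = value;
    m_slot = NULL;
  }

  ~inserting_slot ()
  {
    /* A still-pending, still-empty slot at scope exit is the bug.  */
    assert (m_slot == NULL || *m_slot != NULL);
  }
};

/* Usage sketch:
     inserting_slot<entry> s (table->find_slot_with_hash (key, hash, INSERT));
     ...
     s.set (new_entry);  // forgetting this asserts when S goes out of scope
*/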
The one-pending-insertion strategy I implemented was the only one I
could find that addressed all of the pitfalls without significant
overhead. It caught even one case that the mere element-counting had
failed to catch.
> On Dec 29, 2022, at 00:06, Alexandre Oliva <oliva@adacore.com> wrote:
>
> On Dec 28, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
>
>> I wonder whether, on INSERT, pushing a DELETED marker would fix the
>> dangling-insert and search-during-insert problems, and whether that
>> would be better from a design point of view? (It of course requires
>> a DELETED representation.)
>
> I'm undecided whether a design that rules out the possibility of not
> having DELETED would qualify as unequivocally better.
>
> Unless DELETED takes over the NULL representation, and something else is
> used for EMPTY, every INSERT point would have to be adjusted to look for
> the non-NULL DELETED representation.
Not sure what you mean - I think turning EMPTY (or DELETED) into DELETED at insert time would make the slot occupied for lookups and thus fix lookup during insert. Of course insert during insert would still fail and possibly return the same slot again. So the most foolproof design would be a new INSERTING representation.
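Concretely, something along these lines, as a hypothetical variation of a toy open-addressing table rather than anything this patch implements:

enum slot_state { EMPTY, DELETED, INSERTING, OCCUPIED };

struct slot { slot_state state; int key; };

/* Called by INSERT once it has chosen an EMPTY or DELETED slot: the
   placeholder state keeps the slot occupied as far as lookups are
   concerned until the caller overwrites it with the real entry.  */
slot *
acquire_insert_slot (slot *chosen)
{
  chosen->state = INSERTING;
  return chosen;
}

/* In the lookup loop only EMPTY ends a probe chain, so DELETED and
   INSERTING slots keep the probe going and a lookup issued while an
   insertion is pending no longer stops early.  An insert during a
   pending insert could still hand back the same or a stale slot,
   which is what the pending-insertion check in this patch is for.  */
bool
continues_probe (const slot *s)
{
  return s->state != EMPTY;
}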
> That alone was enough for me to
> rule out going down that path.
>
> If we were to change the INSERT callers, it would make sense to
> e.g. return a smart pointer that enforced the completion of the
> insertion. But that wouldn't fix lookups during insertion without
> requiring a separate DELETED representation.
>
> The one-pending-insertion strategy I implemented was the only one I
> could find that addressed all of the pitfalls without significant
> overhead. It caught even one case that the mere element-counting had
> failed to catch.
Yes, it’s true that this addresses all issues by just imposing constraints on API use. But then I wondered how you caught the completion of the insertion? (I’ll have to dig into the patch to see.)
Richard
>
> --
> Alexandre Oliva, happy hacker https://FSFLA.org/blogs/lxo/
> Free Software Activist GNU Toolchain Engineer
> Disinformation flourishes because many people care deeply about injustice
> but very few check the facts. Ask me about <https://stallmansupport.org>
On Dec 29, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
>> On Dec 29, 2022, at 00:06, Alexandre Oliva <oliva@adacore.com> wrote:
>>
>> On Dec 28, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
>>
>>> I wonder whether, on INSERT, pushing a DELETED marker would fix the
>>> dangling-insert and search-during-insert problems, and whether that
>>> would be better from a design point of view? (It of course requires
>>> a DELETED representation.)
>>
>> I'm undecided whether a design that rules out the possibility of not
>> having DELETED would qualify as unequivocally better.
>>
>> Unless DELETED takes over the NULL representation, and something else is
>> used for EMPTY, every INSERT point would have to be adjusted to look for
>> the non-NULL DELETED representation.
> Not sure what you mean
I meant existing code tests that dereferencing the returned slot pointer
yields NULL. If we were to change the slot value during insert, we'd
have to adjust all callers.
> I think turning EMPTY (or DELETED) into
> DELETED at insert time would make the slot occupied for lookups and
> thus fix lookup during insert.
Yup.
> Of course insert during insert would
> still fail and possibly return the same slot again.
Yeah.
> So the most foolproof design would be a new INSERTING representation.
There's a glitch there: we can't expand with a pending insert. So we
might need to count inserting nodes separately, or (ideally) be able to
check quickly that insertions have completed.
But then, inserting with a pending insert ought to be caught and
reported, because an insertion invalidates previously-returned slot
pointers, including that of the pending insert. And that's how I got to
the proposed patch.
>> The one-pending-insertion strategy I implemented was the only one I
>> could find that addressed all of the pitfalls without significant
>> overhead. It caught even one case that the mere element-counting had
>> failed to catch.
> Yes, it’s true that this addresses all issues by just imposing
> constraints on API use. But then I wondered how you caught the
> completion of the insertion? (I’ll have to dig into the patch to see.)
Record the slot with the pending insert, and at any subsequent operation
(that could be affected by the pending insert), check and report in case
the insertion was not completed.
> On Dec 30, 2022, at 09:53, Alexandre Oliva <oliva@adacore.com> wrote:
>
> On Dec 29, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
>
>>>> On Dec 29, 2022, at 00:06, Alexandre Oliva <oliva@adacore.com> wrote:
>>>
>>> On Dec 28, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
>>>
>>>> I wonder whether, on INSERT, pushing a DELETED marker would fix the
>>>> dangling-insert and search-during-insert problems, and whether that
>>>> would be better from a design point of view? (It of course requires
>>>> a DELETED representation.)
>>>
>>> I'm undecided whether a design that rules out the possibility of not
>>> having DELETED would qualify as unequivocally better.
>>>
>>> Unless DELETED takes over the NULL representation, and something else is
>>> used for EMPTY, every INSERT point would have to be adjusted to look for
>>> the non-NULL DELETED representation.
>
>> Not sure what you mean
>
> I meant existing code tests that dereferencing the returned slot pointer
> yields NULL. If we were to change the slot value during insert, we'd
> have to adjust all callers.
>
>> I think turning EMPTY (or DELETED) into
>> DELETED at insert time would make the slot occupied for lookups and
>> thus fix lookup during insert.
>
> Yup.
>
>> Of course insert during insert would
>> still fail and possibly return the same slot again.
>
> Yeah.
>
>> So the most foolproof design would be a new INSERTING representation.
>
> There's a glitch there: we can't expand with a pending insert. So we
> might need to count inserting nodes separately, or (ideally) be able to
> check quickly that insertions have completed.
>
> But then, inserting with a pending insert ought to be caught and
> reported, because an insertion invalidates previously-returned slot
> pointers, including that of the pending insert. And that's how I got to
> the proposed patch.
>
>>> The one-pending-insertion strategy I implemented was the only one I
>>> could find that addressed all of the pitfalls without significant
>>> overhead. It caught even one case that the mere element-counting had
>>> failed to catch.
>
>> Yes, it’s true that this addresses all issues by just imposing
>> constraints on API use. But then I wondered how you caught the
>> completion of the insertion? (I’ll have to dig into the patch to see.)
>
> Record the slot with the pending insert, and at any subsequent operation
> (that could be affected by the pending insert), check and report in case
> the insertion was not completed.
Ah, OK, so the completion is checked at the next conflicting operation. Yeah, that makes sense I guess.
Thus OK (I think Jeff already approved the patch).
Thanks and happy holidays!
Richard
>
>
> --
> Alexandre Oliva, happy hacker https://FSFLA.org/blogs/lxo/
> Free Software Activist GNU Toolchain Engineer
> Disinformation flourishes because many people care deeply about injustice
> but very few check the facts. Ask me about <https://stallmansupport.org>
On Dec 30, 2022, Richard Biener <richard.guenther@gmail.com> wrote:
> Ah, OK, so the completion is checked at the next conflicting
> operation. Yeah, that makes sense I guess.
*nod*
> Thus OK (I think Jeff already approved the patch).
Thanks, 16/ and 17/ were still pending reviews.
I'm installing 17/ now.
> Thanks and happy holidays!
Happy GNU Year! :-)
@@ -495,6 +495,7 @@ public:
{
if (Lazy && m_entries == NULL)
return iterator ();
+ check_complete_insertion ();
iterator iter (m_entries, m_entries + m_size);
iter.slide ();
return iter;
@@ -551,8 +552,39 @@ private:
Descriptor::mark_empty (v);
}
+public:
+ void check_complete_insertion () const
+ {
+#if CHECKING_P
+ if (!m_inserting_slot)
+ return;
+
+ gcc_checking_assert (m_inserting_slot >= &m_entries[0]
+ && m_inserting_slot < &m_entries[m_size]);
+
+ if (!is_empty (*m_inserting_slot))
+ m_inserting_slot = NULL;
+ else
+ gcc_unreachable ();
+#endif
+ }
+
+private:
+ value_type *check_insert_slot (value_type *ret)
+ {
+#if CHECKING_P
+ gcc_checking_assert (is_empty (*ret));
+ m_inserting_slot = ret;
+#endif
+ return ret;
+ }
+
+#if CHECKING_P
+ mutable value_type *m_inserting_slot;
+#endif
+
/* Table itself. */
- typename Descriptor::value_type *m_entries;
+ value_type *m_entries;
size_t m_size;
@@ -607,6 +639,9 @@ hash_table<Descriptor, Lazy, Allocator>::hash_table (size_t size, bool ggc,
ATTRIBUTE_UNUSED,
mem_alloc_origin origin
MEM_STAT_DECL) :
+#if CHECKING_P
+ m_inserting_slot (0),
+#endif
m_n_elements (0), m_n_deleted (0), m_searches (0), m_collisions (0),
m_ggc (ggc), m_sanitize_eq_and_hash (sanitize_eq_and_hash)
#if GATHER_STATISTICS
@@ -639,6 +674,9 @@ hash_table<Descriptor, Lazy, Allocator>::hash_table (const hash_table &h,
ATTRIBUTE_UNUSED,
mem_alloc_origin origin
MEM_STAT_DECL) :
+#if CHECKING_P
+ m_inserting_slot (0),
+#endif
m_n_elements (h.m_n_elements), m_n_deleted (h.m_n_deleted),
m_searches (0), m_collisions (0), m_ggc (ggc),
m_sanitize_eq_and_hash (sanitize_eq_and_hash)
@@ -646,6 +684,8 @@ hash_table<Descriptor, Lazy, Allocator>::hash_table (const hash_table &h,
, m_gather_mem_stats (gather_mem_stats)
#endif
{
+ h.check_complete_insertion ();
+
size_t size = h.m_size;
if (m_gather_mem_stats)
@@ -675,6 +715,8 @@ template<typename Descriptor, bool Lazy,
template<typename Type> class Allocator>
hash_table<Descriptor, Lazy, Allocator>::~hash_table ()
{
+ check_complete_insertion ();
+
if (!Lazy || m_entries)
{
for (size_t i = m_size - 1; i < m_size; i--)
@@ -778,6 +820,8 @@ template<typename Descriptor, bool Lazy,
void
hash_table<Descriptor, Lazy, Allocator>::expand ()
{
+ check_complete_insertion ();
+
value_type *oentries = m_entries;
unsigned int oindex = m_size_prime_index;
size_t osize = size ();
@@ -853,6 +897,8 @@ template<typename Descriptor, bool Lazy,
void
hash_table<Descriptor, Lazy, Allocator>::empty_slow ()
{
+ check_complete_insertion ();
+
size_t size = m_size;
size_t nsize = size;
value_type *entries = m_entries;
@@ -901,6 +947,8 @@ template<typename Descriptor, bool Lazy,
void
hash_table<Descriptor, Lazy, Allocator>::clear_slot (value_type *slot)
{
+ check_complete_insertion ();
+
gcc_checking_assert (!(slot < m_entries || slot >= m_entries + size ()
|| is_empty (*slot) || is_deleted (*slot)));
@@ -927,6 +975,8 @@ hash_table<Descriptor, Lazy, Allocator>
if (Lazy && m_entries == NULL)
m_entries = alloc_entries (size);
+ check_complete_insertion ();
+
#if CHECKING_P
if (m_sanitize_eq_and_hash)
verify (comparable, hash);
@@ -976,6 +1026,8 @@ hash_table<Descriptor, Lazy, Allocator>
}
if (insert == INSERT && m_size * 3 <= m_n_elements * 4)
expand ();
+ else
+ check_complete_insertion ();
#if CHECKING_P
if (m_sanitize_eq_and_hash)
@@ -1022,11 +1074,11 @@ hash_table<Descriptor, Lazy, Allocator>
{
m_n_deleted--;
mark_empty (*first_deleted_slot);
- return first_deleted_slot;
+ return check_insert_slot (first_deleted_slot);
}
m_n_elements++;
- return &m_entries[index];
+ return check_insert_slot (&m_entries[index]);
}
/* Verify that all existing elements in the hash table which are
@@ -1068,6 +1120,8 @@ void
hash_table<Descriptor, Lazy, Allocator>
::remove_elt_with_hash (const compare_type &comparable, hashval_t hash)
{
+ check_complete_insertion ();
+
value_type *slot = find_slot_with_hash (comparable, hash, NO_INSERT);
if (slot == NULL)
return;
@@ -1094,6 +1148,8 @@ hash_table<Descriptor, Lazy, Allocator>::traverse_noresize (Argument argument)
if (Lazy && m_entries == NULL)
return;
+ check_complete_insertion ();
+
value_type *slot = m_entries;
value_type *limit = slot + size ();
@@ -1210,6 +1266,7 @@ template<typename D>
static void
gt_pch_nx (hash_table<D> *h)
{
+ h->check_complete_insertion ();
bool success
= gt_pch_note_object (h->m_entries, h, hashtab_entry_note_pointers<D>);
gcc_checking_assert (success);