[12/21] middle-end: Add remaining changes to peeling and vectorizer to support early breaks
Checks
Commit Message
Hi All,
This finishes wiring that didn't fit in any of the other patches.
Essentially just adding related changes so peeling for early break works.
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* tree-vect-loop-manip.cc (vect_set_loop_condition_normal,
vect_do_peeling): Support early breaks.
* tree-vect-loop.cc (vect_need_peeling_or_partial_vectors_p): Likewise.
* tree-vectorizer.cc (pass_vectorize::execute): Check all exits.
--- inline copy of patch --
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index eef2bb50c1505f5cf802d5d80300affc2cbe69f6..9c1405d79fd8fe8689007df3b7605b7a3d3ecdd7 100644
--
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index eef2bb50c1505f5cf802d5d80300affc2cbe69f6..9c1405d79fd8fe8689007df3b7605b7a3d3ecdd7 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1187,7 +1187,7 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
loop handles exactly VF scalars per iteration. */
static gcond *
-vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
+vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
class loop *loop, tree niters, tree step,
tree final_iv, bool niters_maybe_zero,
gimple_stmt_iterator loop_cond_gsi)
@@ -1296,7 +1296,8 @@ vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
/* Record the number of latch iterations. */
- if (limit == niters)
+ if (limit == niters
+ || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
/* Case A: the loop iterates NITERS times. Subtract one to get the
latch count. */
loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
@@ -3242,6 +3243,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
bound_epilog += vf - 1;
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
bound_epilog += 1;
+
+ /* For early breaks the scalar loop needs to execute at most VF times
+ to find the element that caused the break. */
+ if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+ {
+ bound_epilog = vf;
+ /* Force a scalar epilogue as we can't vectorize the index finding. */
+ vect_epilogues = false;
+ }
+
bool epilog_peeling = maybe_ne (bound_epilog, 0U);
poly_uint64 bound_scalar = bound_epilog;
@@ -3376,14 +3387,23 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
bound_prolog + bound_epilog)
: (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
|| vect_epilogues));
+
+ /* We only support early break vectorization on known bounds at this time.
+ This means that if the vector loop can't be entered then we won't generate
+ it at all. So for now force skip_vector off because the additional control
+ flow messes with the BB exits and we've already analyzed them. */
+ skip_vector = skip_vector && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
+
/* Epilog loop must be executed if the number of iterations for epilog
loop is known at compile time, otherwise we need to add a check at
the end of vector loop and skip to the end of epilog loop. */
bool skip_epilog = (prolog_peeling < 0
|| !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
|| !vf.is_constant ());
- /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
+ loop must be executed. */
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
skip_epilog = false;
class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 55d6aee3d29151e6b528f6fdde15c693e5bdd847..51a054c5b035ac80dfbbf3b5ba2f6da82fda91f6 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1236,6 +1236,14 @@ vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
(loop_vinfo));
+ /* When we have multiple exits and VF is unknown, we must require partial
+ vectors because the loop bounds is not a minimum but a maximum. That is to
+ say we cannot unpredicate the main loop unless we peel or use partial
+ vectors in the epilogue. */
+ if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+ && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
+ return true;
+
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
&& LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
{
@@ -3149,7 +3157,8 @@ start_over:
/* If an epilogue loop is required make sure we can create one. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+ || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+ || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
index d97e2b54c25ac60378935392aa7b73476efed74b..8b495fc7ca137109fc0097c77837233c660e032a 100644
--- a/gcc/tree-vectorizer.cc
+++ b/gcc/tree-vectorizer.cc
@@ -1381,7 +1381,9 @@ pass_vectorize::execute (function *fun)
predicates that need to be shared for optimal predicate usage.
However reassoc will re-order them and prevent CSE from working
as it should. CSE only the loop body, not the entry. */
- bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
+ auto_vec<edge> exits = get_loop_exit_edges (loop);
+ for (edge exit : exits)
+ bitmap_set_bit (exit_bbs, exit->dest->index);
edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
do_rpo_vn (fun, entry, exit_bbs);
Comments
On Mon, 6 Nov 2023, Tamar Christina wrote:
> Hi All,
>
> This finishes wiring that didn't fit in any of the other patches.
> Essentially just adding related changes so peeling for early break works.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * tree-vect-loop-manip.cc (vect_set_loop_condition_normal,
> vect_do_peeling): Support early breaks.
> * tree-vect-loop.cc (vect_need_peeling_or_partial_vectors_p): Likewise.
> * tree-vectorizer.cc (pass_vectorize::execute): Check all exits.
>
> --- inline copy of patch --
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index eef2bb50c1505f5cf802d5d80300affc2cbe69f6..9c1405d79fd8fe8689007df3b7605b7a3d3ecdd7 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1187,7 +1187,7 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
> loop handles exactly VF scalars per iteration. */
>
> static gcond *
> -vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
> +vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
> class loop *loop, tree niters, tree step,
> tree final_iv, bool niters_maybe_zero,
> gimple_stmt_iterator loop_cond_gsi)
> @@ -1296,7 +1296,8 @@ vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
> gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
>
> /* Record the number of latch iterations. */
> - if (limit == niters)
> + if (limit == niters
> + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> /* Case A: the loop iterates NITERS times. Subtract one to get the
> latch count. */
> loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
> @@ -3242,6 +3243,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
> bound_epilog += vf - 1;
> if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
> bound_epilog += 1;
> +
> + /* For early breaks the scalar loop needs to execute at most VF times
> + to find the element that caused the break. */
> + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> + {
> + bound_epilog = vf;
> + /* Force a scalar epilogue as we can't vectorize the index finding. */
> + vect_epilogues = false;
This is originally initialized with
bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
so I think we should avoid filling that with LOOP_VINFO_EARLY_BREAKS
rather than fixing up after the fact? That is in vect_analyze_loop
adjust
/* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
enabled, SIMDUID is not set, it is the innermost loop and we have
either already found the loop's SIMDLEN or there was no SIMDLEN to
begin with.
TODO: Enable epilogue vectorization for loops with SIMDUID set. */
bool vect_epilogues = (!simdlen
&& loop->inner == NULL
&& param_vect_epilogues_nomask
&& LOOP_VINFO_PEELING_FOR_NITER
(first_loop_vinfo)
&& !loop->simduid);
and add !LOOP_VINFO_EARLY_BREAKS?
> + }
> +
> bool epilog_peeling = maybe_ne (bound_epilog, 0U);
> poly_uint64 bound_scalar = bound_epilog;
>
> @@ -3376,14 +3387,23 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
> bound_prolog + bound_epilog)
> : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
> || vect_epilogues));
> +
> + /* We only support early break vectorization on known bounds at this time.
> + This means that if the vector loop can't be entered then we won't generate
> + it at all. So for now force skip_vector off because the additional control
> + flow messes with the BB exits and we've already analyzed them. */
> + skip_vector = skip_vector && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
> +
bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
bound_prolog + bound_epilog)
: (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
|| vect_epilogues));
to me that looks like
gcc_assert (!skip_vector || !LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
should work? You are basically relying on cost modeling rejecting
vectorization that doesn't enter the vector loop.
> /* Epilog loop must be executed if the number of iterations for epilog
> loop is known at compile time, otherwise we need to add a check at
> the end of vector loop and skip to the end of epilog loop. */
> bool skip_epilog = (prolog_peeling < 0
> || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> || !vf.is_constant ());
> - /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
> - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
> + /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
> + loop must be executed. */
> + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> skip_epilog = false;
>
> class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 55d6aee3d29151e6b528f6fdde15c693e5bdd847..51a054c5b035ac80dfbbf3b5ba2f6da82fda91f6 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1236,6 +1236,14 @@ vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
> th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
> (loop_vinfo));
>
> + /* When we have multiple exits and VF is unknown, we must require partial
> + vectors because the loop bounds is not a minimum but a maximum. That is to
> + say we cannot unpredicate the main loop unless we peel or use partial
> + vectors in the epilogue. */
> + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> + && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
> + return true;
> +
I don't quite understand the !.is_constant (), early breaks always
require peeling?
> if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
> {
> @@ -3149,7 +3157,8 @@ start_over:
>
> /* If an epilogue loop is required make sure we can create one. */
> if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> - || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
> + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
> + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
> diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
> index d97e2b54c25ac60378935392aa7b73476efed74b..8b495fc7ca137109fc0097c77837233c660e032a 100644
> --- a/gcc/tree-vectorizer.cc
> +++ b/gcc/tree-vectorizer.cc
> @@ -1381,7 +1381,9 @@ pass_vectorize::execute (function *fun)
> predicates that need to be shared for optimal predicate usage.
> However reassoc will re-order them and prevent CSE from working
> as it should. CSE only the loop body, not the entry. */
> - bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
> + auto_vec<edge> exits = get_loop_exit_edges (loop);
> + for (edge exit : exits)
> + bitmap_set_bit (exit_bbs, exit->dest->index);
>
> edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
> do_rpo_vn (fun, entry, exit_bbs);
Otherwise OK.
Richard.
On Wed, 6 Dec 2023, Tamar Christina wrote:
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Wednesday, December 6, 2023 8:32 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: Re: [PATCH 12/21]middle-end: Add remaining changes to peeling and
> > vectorizer to support early breaks
> >
> > On Mon, 6 Nov 2023, Tamar Christina wrote:
> >
> > > Hi All,
> > >
> > > This finishes wiring that didn't fit in any of the other patches.
> > > Essentially just adding related changes so peeling for early break works.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > * tree-vect-loop-manip.cc (vect_set_loop_condition_normal,
> > > vect_do_peeling): Support early breaks.
> > > * tree-vect-loop.cc (vect_need_peeling_or_partial_vectors_p): Likewise.
> > > * tree-vectorizer.cc (pass_vectorize::execute): Check all exits.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > index
> > eef2bb50c1505f5cf802d5d80300affc2cbe69f6..9c1405d79fd8fe8689007df3b7
> > 605b7a3d3ecdd7 100644
> > > --- a/gcc/tree-vect-loop-manip.cc
> > > +++ b/gcc/tree-vect-loop-manip.cc
> > > @@ -1187,7 +1187,7 @@ vect_set_loop_condition_partial_vectors_avx512
> > (class loop *loop,
> > > loop handles exactly VF scalars per iteration. */
> > >
> > > static gcond *
> > > -vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge
> > exit_edge,
> > > +vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
> > > class loop *loop, tree niters, tree step,
> > > tree final_iv, bool niters_maybe_zero,
> > > gimple_stmt_iterator loop_cond_gsi)
> > > @@ -1296,7 +1296,8 @@ vect_set_loop_condition_normal (loop_vec_info /*
> > loop_vinfo */, edge exit_edge,
> > > gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
> > >
> > > /* Record the number of latch iterations. */
> > > - if (limit == niters)
> > > + if (limit == niters
> > > + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > /* Case A: the loop iterates NITERS times. Subtract one to get the
> > > latch count. */
> > > loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
> > > @@ -3242,6 +3243,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> > niters, tree nitersm1,
> > > bound_epilog += vf - 1;
> > > if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
> > > bound_epilog += 1;
> > > +
> > > + /* For early breaks the scalar loop needs to execute at most VF times
> > > + to find the element that caused the break. */
> > > + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > + {
> > > + bound_epilog = vf;
> > > + /* Force a scalar epilogue as we can't vectorize the index finding. */
> > > + vect_epilogues = false;
> >
> > This is originally initialized with
> >
> > bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
> >
> > so I think we should avoid filling that with LOOP_VINFO_EARLY_BREAKS
> > rather than fixing up after the fact? That is in vect_analyze_loop
> > adjust
> >
> > /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
> > enabled, SIMDUID is not set, it is the innermost loop and we have
> > either already found the loop's SIMDLEN or there was no SIMDLEN to
> > begin with.
> > TODO: Enable epilogue vectorization for loops with SIMDUID set. */
> > bool vect_epilogues = (!simdlen
> > && loop->inner == NULL
> > && param_vect_epilogues_nomask
> > && LOOP_VINFO_PEELING_FOR_NITER
> > (first_loop_vinfo)
> > && !loop->simduid);
> >
> > and add !LOOP_VINFO_EARLY_BREAKS?
> >
> > > + }
> > > +
> > > bool epilog_peeling = maybe_ne (bound_epilog, 0U);
> > > poly_uint64 bound_scalar = bound_epilog;
> > >
> > > @@ -3376,14 +3387,23 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> > niters, tree nitersm1,
> > > bound_prolog + bound_epilog)
> > > : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
> > > || vect_epilogues));
> > > +
> > > + /* We only support early break vectorization on known bounds at this time.
> > > + This means that if the vector loop can't be entered then we won't generate
> > > + it at all. So for now force skip_vector off because the additional control
> > > + flow messes with the BB exits and we've already analyzed them. */
> > > + skip_vector = skip_vector && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
> > > +
> >
> > bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> > ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
> > bound_prolog + bound_epilog)
> > : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
> > || vect_epilogues));
> >
> > to me that looks like
> >
> > gcc_assert (!skip_vector || !LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> >
> > should work? You are basically relying on cost modeling rejecting
> > vectorization that doesn't enter the vector loop.
> >
> > > /* Epilog loop must be executed if the number of iterations for epilog
> > > loop is known at compile time, otherwise we need to add a check at
> > > the end of vector loop and skip to the end of epilog loop. */
> > > bool skip_epilog = (prolog_peeling < 0
> > > || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> > > || !vf.is_constant ());
> > > - /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
> > > - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
> > > + /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
> > > + loop must be executed. */
> > > + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> > > + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > skip_epilog = false;
> > >
> > > class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index
> > 55d6aee3d29151e6b528f6fdde15c693e5bdd847..51a054c5b035ac80dfbbf3b5
> > ba2f6da82fda91f6 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -1236,6 +1236,14 @@ vect_need_peeling_or_partial_vectors_p
> > (loop_vec_info loop_vinfo)
> > > th = LOOP_VINFO_COST_MODEL_THRESHOLD
> > (LOOP_VINFO_ORIG_LOOP_INFO
> > > (loop_vinfo));
> > >
> > > + /* When we have multiple exits and VF is unknown, we must require partial
> > > + vectors because the loop bounds is not a minimum but a maximum. That is
> > to
> > > + say we cannot unpredicate the main loop unless we peel or use partial
> > > + vectors in the epilogue. */
> > > + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > + && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
> > > + return true;
> > > +
> >
> > I don't quite understand the !.is_constant (), early breaks always
> > require peeling?
>
> It's mostly to force the use of an unpredicated main loop, since this
> forces LOOP_VINFO_PEELING_FOR_NITER. I can alternatively not
> set it and then mask the individual comparisons in
> vectorizable_early_break. Both work equally, I guess that may be
> better since the assumption is that the break is not taken.
That might be easier to understand, yes. But why not
if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
return true;
? Because even with constant VF we can predicate the main loop
(with AVX512, --param vect-partial-vector-usage=2)
> Any preference?
>
> Thanks,
> Tamar
>
> >
> > > if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> > > && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
> > > {
> > > @@ -3149,7 +3157,8 @@ start_over:
> > >
> > > /* If an epilogue loop is required make sure we can create one. */
> > > if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> > > - || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
> > > + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
> > > + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > {
> > > if (dump_enabled_p ())
> > > dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
> > > diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
> > > index
> > d97e2b54c25ac60378935392aa7b73476efed74b..8b495fc7ca137109fc0097c7
> > 7837233c660e032a 100644
> > > --- a/gcc/tree-vectorizer.cc
> > > +++ b/gcc/tree-vectorizer.cc
> > > @@ -1381,7 +1381,9 @@ pass_vectorize::execute (function *fun)
> > > predicates that need to be shared for optimal predicate usage.
> > > However reassoc will re-order them and prevent CSE from working
> > > as it should. CSE only the loop body, not the entry. */
> > > - bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
> > > + auto_vec<edge> exits = get_loop_exit_edges (loop);
> > > + for (edge exit : exits)
> > > + bitmap_set_bit (exit_bbs, exit->dest->index);
> > >
> > > edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
> > > do_rpo_vn (fun, entry, exit_bbs);
> >
> > Otherwise OK.
> >
> > Richard.
>
@@ -1187,7 +1187,7 @@ vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
loop handles exactly VF scalars per iteration. */
static gcond *
-vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
+vect_set_loop_condition_normal (loop_vec_info loop_vinfo, edge exit_edge,
class loop *loop, tree niters, tree step,
tree final_iv, bool niters_maybe_zero,
gimple_stmt_iterator loop_cond_gsi)
@@ -1296,7 +1296,8 @@ vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
/* Record the number of latch iterations. */
- if (limit == niters)
+ if (limit == niters
+ || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
/* Case A: the loop iterates NITERS times. Subtract one to get the
latch count. */
loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
@@ -3242,6 +3243,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
bound_epilog += vf - 1;
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
bound_epilog += 1;
+
+ /* For early breaks the scalar loop needs to execute at most VF times
+ to find the element that caused the break. */
+ if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+ {
+ bound_epilog = vf;
+ /* Force a scalar epilogue as we can't vectorize the index finding. */
+ vect_epilogues = false;
+ }
+
bool epilog_peeling = maybe_ne (bound_epilog, 0U);
poly_uint64 bound_scalar = bound_epilog;
@@ -3376,14 +3387,23 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
bound_prolog + bound_epilog)
: (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
|| vect_epilogues));
+
+ /* We only support early break vectorization on known bounds at this time.
+ This means that if the vector loop can't be entered then we won't generate
+ it at all. So for now force skip_vector off because the additional control
+ flow messes with the BB exits and we've already analyzed them. */
+ skip_vector = skip_vector && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
+
/* Epilog loop must be executed if the number of iterations for epilog
loop is known at compile time, otherwise we need to add a check at
the end of vector loop and skip to the end of epilog loop. */
bool skip_epilog = (prolog_peeling < 0
|| !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
|| !vf.is_constant ());
- /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
+ loop must be executed. */
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
skip_epilog = false;
class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
@@ -1236,6 +1236,14 @@ vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
(loop_vinfo));
+ /* When we have multiple exits and VF is unknown, we must require partial
+ vectors because the loop bounds is not a minimum but a maximum. That is to
+ say we cannot unpredicate the main loop unless we peel or use partial
+ vectors in the epilogue. */
+ if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+ && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
+ return true;
+
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
&& LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
{
@@ -3149,7 +3157,8 @@ start_over:
/* If an epilogue loop is required make sure we can create one. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+ || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+ || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
@@ -1381,7 +1381,9 @@ pass_vectorize::execute (function *fun)
predicates that need to be shared for optimal predicate usage.
However reassoc will re-order them and prevent CSE from working
as it should. CSE only the loop body, not the entry. */
- bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
+ auto_vec<edge> exits = get_loop_exit_edges (loop);
+ for (edge exit : exits)
+ bitmap_set_bit (exit_bbs, exit->dest->index);
edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
do_rpo_vn (fun, entry, exit_bbs);