new file mode 100644
@@ -0,0 +1,38 @@
+! { dg-do compile }
+! { dg-additional-options "-ffast-math -fdump-tree-slp1" }
+
+ subroutine shell(nx,ny,nz,q,dt,cfl,dx,dy,dz,cfll,gm,Pr,Re) ! test kernel: sets dt to cfll divided by the grid-wide maximum of t7
+ implicit none
+ integer nx,ny,nz,i,j,k
+ real*8 cfl,dx,dy,dz,dt
+ real*8 gm,Re,Pr,cfll,t1,t2,t3,t4,t5,t6,t7,t8,mu
+ real*8 q(5,nx,ny,nz)
+
+ if (cfll.ge.cfl) cfll=cfl ! cap cfll at cfl (modifies the caller's argument)
+ t8=0.0d0 ! running maximum of t7 over all cells
+
+ do k=1,nz
+ do j=1,ny
+ do i=1,nx
+ t1=q(1,i,j,k) ! first component of q; used as divisor below
+ t2=q(2,i,j,k)/t1
+ t3=q(3,i,j,k)/t1
+ t4=q(4,i,j,k)/t1
+ t5=(gm-1.0d0)*(q(5,i,j,k)-0.5d0*t1*(t2*t2+t3*t3+t4*t4))
+ t6=dSQRT(gm*t5/t1)
+ mu=gm*Pr*(gm*t5/t1)**0.75d0*2.0d0/Re/t1
+ t7=((dabs(t2)+t6)/dx+mu/dx**2)**2 +
+ 1 ((dabs(t3)+t6)/dy+mu/dy**2)**2 +
+ 2 ((dabs(t4)+t6)/dz+mu/dz**2)**2
+ t7=DSQRT(t7) ! root of the three-term sum of squares above; presumably that 3-lane sum is what the .REDUC_PLUS scan targets
+ t8=max(t8,t7) ! keep the largest t7 seen so far
+ enddo
+ enddo
+ enddo
+ dt=cfll / t8 ! result is returned through the dt argument
+
+ return
+ end
+
+! We don't have an effective target for reduc_plus_scal optab support
+! { dg-final { scan-tree-dump ".REDUC_PLUS" "slp1" { target x86_64-*-* } } }
@@ -209,6 +209,7 @@ vect_free_slp_instance (slp_instance instance)
vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
SLP_INSTANCE_LOADS (instance).release ();
SLP_INSTANCE_ROOT_STMTS (instance).release ();
+ SLP_INSTANCE_REMAIN_STMTS (instance).release ();
instance->subgraph_entries.release ();
instance->cost_vec.release ();
free (instance);
@@ -3128,6 +3129,16 @@ vect_build_slp_instance (vec_info *vinfo,
" %G", scalar_stmts[i]->stmt);
}
+ /* When a BB reduction has an odd number of lanes, strip the last
+ lane and leave it as a scalar operand outside the SLP tree.
+ ??? Selecting the optimal set of lanes to vectorize would be nice
+ but SLP build for all lanes will fail quickly because we think
+ we're going to need unrolling. */
+ auto_vec<stmt_vec_info> remain;
+ if (kind == slp_inst_kind_bb_reduc
+ && (scalar_stmts.length () & 1))
+ remain.safe_push (scalar_stmts.pop ());
+
/* Build the tree for the SLP instance. */
unsigned int group_size = scalar_stmts.length ();
bool *matches = XALLOCAVEC (bool, group_size);
@@ -3175,6 +3186,10 @@ vect_build_slp_instance (vec_info *vinfo,
SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
SLP_INSTANCE_LOADS (new_instance) = vNULL;
SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ if (!remain.is_empty ())
+ SLP_INSTANCE_REMAIN_STMTS (new_instance) = remain.copy ();
+ else
+ SLP_INSTANCE_REMAIN_STMTS (new_instance) = vNULL;
SLP_INSTANCE_KIND (new_instance) = kind;
new_instance->reduc_phis = NULL;
new_instance->cost_vec = vNULL;
@@ -9138,7 +9153,20 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
gcc_unreachable ();
tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
-
+ if (!SLP_INSTANCE_REMAIN_STMTS (instance).is_empty ())
+ {
+ tree rem_def = NULL_TREE;
+ for (auto rem : SLP_INSTANCE_REMAIN_STMTS (instance))
+ if (!rem_def)
+ rem_def = gimple_get_lhs (rem->stmt);
+ else
+ rem_def = gimple_build (&epilogue, reduc_code,
+ TREE_TYPE (scalar_def),
+ rem_def, gimple_get_lhs (rem->stmt));
+ scalar_def = gimple_build (&epilogue, reduc_code,
+ TREE_TYPE (scalar_def),
+ scalar_def, rem_def);
+ }
gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
@@ -257,6 +257,10 @@ public:
from, NULL otherwise. */
vec<stmt_vec_info> root_stmts;
+ /* For slp_inst_kind_bb_reduc the scalar stmts whose defs were not
+ vectorized, vNULL otherwise. */
+ vec<stmt_vec_info> remain_stmts;
+
/* The unrolling factor required to vectorized this SLP instance. */
poly_uint64 unrolling_factor;
@@ -285,6 +289,7 @@ public:
#define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor
#define SLP_INSTANCE_LOADS(S) (S)->loads
#define SLP_INSTANCE_ROOT_STMTS(S) (S)->root_stmts
+#define SLP_INSTANCE_REMAIN_STMTS(S) (S)->remain_stmts
#define SLP_INSTANCE_KIND(S) (S)->kind
#define SLP_TREE_CHILDREN(S) (S)->children