[committed,1/6] amdgcn: add multiple vector sizes

Message ID 45381d6f9f4e7b5c7b062f5ad8cc9788091c2d07.1665485382.git.ams@codesourcery.com
State New, archived
Series amdgcn: Add V32, V16, V8, V4, and V2 vectors

Commit Message

Andrew Stubbs Oct. 11, 2022, 11:02 a.m. UTC
  The vector sizes are simulated using implicit masking, but they make life
easier for the autovectorizer and SLP passes.
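
For context, the new sizes are not separate hardware vector lengths: a
V4SImode value still lives in a full 64-lane VGPR, and operations on it are
ordinary 64-lane instructions issued with only the low lanes enabled.  A
minimal sketch of that idea, reusing helper names from this patch (the
wrapper function itself is hypothetical):

  /* Illustrative only: emit a V4SImode addition by enabling lanes 0-3 of
     the 64-lane vector unit.  get_exec, gcn_gen_undef and the
     gen_addv4si3_exec pattern follow the conventions used in this patch.  */
  static void
  example_addv4si (rtx dest, rtx src1, rtx src2)
  {
    rtx exec = get_exec (0xf);          /* Lanes 0-3 active.  */
    emit_insn (gen_addv4si3_exec (dest, src1, src2,
                                  gcn_gen_undef (V4SImode), exec));
  }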

gcc/ChangeLog:

	* config/gcn/gcn-modes.def (VECTOR_MODE): Add new modes
	V32QI, V32HI, V32SI, V32DI, V32TI, V32HF, V32SF, V32DF,
	V16QI, V16HI, V16SI, V16DI, V16TI, V16HF, V16SF, V16DF,
	V8QI, V8HI, V8SI, V8DI, V8TI, V8HF, V8SF, V8DF,
	V4QI, V4HI, V4SI, V4DI, V4TI, V4HF, V4SF, V4DF,
	V2QI, V2HI, V2SI, V2DI, V2TI, V2HF, V2SF, V2DF.
	(ADJUST_ALIGNMENT): Likewise.
	* config/gcn/gcn-protos.h (gcn_full_exec): Delete.
	(gcn_full_exec_reg): Delete.
	(gcn_scalar_exec): Delete.
	(gcn_scalar_exec_reg): Delete.
	(vgpr_1reg_mode_p): Use inner mode to identify vector registers.
	(vgpr_2reg_mode_p): Likewise.
	(vgpr_vector_mode_p): Use VECTOR_MODE_P.
	* config/gcn/gcn-valu.md (V_QI, V_HI, V_HF, V_SI, V_SF, V_DI, V_DF,
	V_QIHI, V_1REG, V_INT_1REG, V_INT_1REG_ALT, V_FP_1REG, V_2REG, V_noQI,
	V_noHI, V_INT_noQI, V_INT_noHI, V_ALL, V_ALL_ALT, V_INT, V_FP):
	Add additional vector modes.
	(V64_SI, V64_DI, V64_ALL, V64_FP): New iterators.
	(scalar_mode, SCALAR_MODE, vnsi, VnSI, vndi, VnDI, sdwa):
	Add additional vector mode mappings.
	(mov<mode>): Implement vector length conversions.
	(ldexp<mode>3<exec>): Use VnSI.
	(frexp<mode>_exp2<exec>): Likewise.
	(VCVT_MODE, VCVT_FMODE, VCVT_IMODE): Add additional vector modes.
	(reduc_<reduc_op>_scal_<mode>): Use V64_ALL.
	(fold_left_plus_<mode>): Use V64_FP.
	(*<reduc_op>_dpp_shr_<mode>): Use V64_1REG.
	(*<reduc_op>_dpp_shr_<mode>): Use V64_DI.
	(*plus_carry_dpp_shr_<mode>): Use V64_INT_1REG.
	(*plus_carry_in_dpp_shr_<mode>): Use V64_SI.
	(*plus_carry_dpp_shr_<mode>): Use V64_DI.
	(mov_from_lane63_<mode>): Use V64_2REG.
	* config/gcn/gcn.cc (VnMODE): New function.
	(gcn_can_change_mode_class): Support multiple vector sizes.
	(gcn_modes_tieable_p): Likewise.
	(gcn_operand_part): Likewise.
	(gcn_scalar_exec): Delete function.
	(gcn_scalar_exec_reg): Delete function.
	(gcn_full_exec): Delete function.
	(gcn_full_exec_reg): Delete function.
	(gcn_inline_fp_constant_p): Support multiple vector sizes.
	(gcn_fp_constant_p): Likewise.
	(A): New macro.
	(GEN_VN_NOEXEC): New macro.
	(GEN_VNM_NOEXEC): New macro.
	(GEN_VN): New macro.
	(GEN_VNM): New macro.
	(GET_VN_FN): New macro.
	(CODE_FOR): New macro.
	(CODE_FOR_OP): New macro.
	(gen_mov_with_exec): Delete function.
	(gen_duplicate_load): Delete function.
	(gcn_expand_vector_init): Support multiple vector sizes.
	(strided_constant): Likewise.
	(gcn_addr_space_legitimize_address): Likewise.
	(gcn_expand_scalar_to_vector_address): Likewise.
	(gcn_expand_scaled_offsets): Likewise.
	(gcn_secondary_reload): Likewise.
	(gcn_valid_cvt_p): Likewise.
	(gcn_expand_builtin_1): Likewise.
	(gcn_make_vec_perm_address): Likewise.
	(gcn_vectorize_vec_perm_const): Likewise.
	(gcn_vector_mode_supported_p): Likewise.
	(gcn_autovectorize_vector_modes): New hook.
	(gcn_related_vector_mode): Support multiple vector sizes.
	(gcn_expand_dpp_shr_insn): Add FIXME comment.
	(gcn_md_reorg): Support multiple vector sizes.
	(print_reg): Likewise.
	(print_operand): Likewise.
	(TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES): New hook.
---
 gcc/config/gcn/gcn-modes.def |  82 ++++
 gcc/config/gcn/gcn-protos.h  |  22 +-
 gcc/config/gcn/gcn-valu.md   | 332 ++++++++++---
 gcc/config/gcn/gcn.cc        | 927 ++++++++++++++++++++++-------------
 4 files changed, 938 insertions(+), 425 deletions(-)
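
Most of the gcn.cc churn is mechanical: call sites that previously named a
fixed gen_*v64* pattern now go through the size-generic gen_*vN*/gen_*vNm*
accessors added below, which dispatch on the mode of the destination operand
and fold the old plain/_exec distinction into two optional trailing
arguments.  A rough sketch of a call site after the change (the surrounding
code is illustrative; the helper names are the ones defined in the patch):

  /* gen_addvNsi3 dispatches on GET_MODE (dest) -- V8SImode here, so it
     expands via gen_addv8si3_exec -- and, because EXEC is non-null while
     the merge source is NULL, it supplies an undef merge value itself.  */
  rtx dest = gen_reg_rtx (V8SImode);
  emit_insn (gen_addvNsi3 (dest, src1, src2,
                           NULL, get_exec (0xff)));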
  

Patch

diff --git a/gcc/config/gcn/gcn-modes.def b/gcc/config/gcn/gcn-modes.def
index 82585de798b..1b8a3203463 100644
--- a/gcc/config/gcn/gcn-modes.def
+++ b/gcc/config/gcn/gcn-modes.def
@@ -29,6 +29,48 @@  VECTOR_MODE (FLOAT, HF, 64);    /*		  V64HF */
 VECTOR_MODE (FLOAT, SF, 64);    /*		  V64SF */
 VECTOR_MODE (FLOAT, DF, 64);    /*		  V64DF */
 
+/* Artificial vector modes, for when vector masking doesn't work (yet).  */
+VECTOR_MODE (INT, QI, 32);      /*		  V32QI */
+VECTOR_MODE (INT, HI, 32);      /*		  V32HI */
+VECTOR_MODE (INT, SI, 32);      /*		  V32SI */
+VECTOR_MODE (INT, DI, 32);      /*		  V32DI */
+VECTOR_MODE (INT, TI, 32);      /*		  V32TI */
+VECTOR_MODE (FLOAT, HF, 32);    /*		  V32HF */
+VECTOR_MODE (FLOAT, SF, 32);    /*		  V32SF */
+VECTOR_MODE (FLOAT, DF, 32);    /*		  V32DF */
+VECTOR_MODE (INT, QI, 16);      /*		  V16QI */
+VECTOR_MODE (INT, HI, 16);      /*		  V16HI */
+VECTOR_MODE (INT, SI, 16);      /*		  V16SI */
+VECTOR_MODE (INT, DI, 16);      /*		  V16DI */
+VECTOR_MODE (INT, TI, 16);      /*		  V16TI */
+VECTOR_MODE (FLOAT, HF, 16);    /*		  V16HF */
+VECTOR_MODE (FLOAT, SF, 16);    /*		  V16SF */
+VECTOR_MODE (FLOAT, DF, 16);    /*		  V16DF */
+VECTOR_MODE (INT, QI, 8);      /*		  V8QI */
+VECTOR_MODE (INT, HI, 8);      /*		  V8HI */
+VECTOR_MODE (INT, SI, 8);      /*		  V8SI */
+VECTOR_MODE (INT, DI, 8);      /*		  V8DI */
+VECTOR_MODE (INT, TI, 8);      /*		  V8TI */
+VECTOR_MODE (FLOAT, HF, 8);    /*		  V8HF */
+VECTOR_MODE (FLOAT, SF, 8);    /*		  V8SF */
+VECTOR_MODE (FLOAT, DF, 8);    /*		  V8DF */
+VECTOR_MODE (INT, QI, 4);      /*		  V4QI */
+VECTOR_MODE (INT, HI, 4);      /*		  V4HI */
+VECTOR_MODE (INT, SI, 4);      /*		  V4SI */
+VECTOR_MODE (INT, DI, 4);      /*		  V4DI */
+VECTOR_MODE (INT, TI, 4);      /*		  V4TI */
+VECTOR_MODE (FLOAT, HF, 4);    /*		  V4HF */
+VECTOR_MODE (FLOAT, SF, 4);    /*		  V4SF */
+VECTOR_MODE (FLOAT, DF, 4);    /*		  V4DF */
+VECTOR_MODE (INT, QI, 2);      /*		  V2QI */
+VECTOR_MODE (INT, HI, 2);      /*		  V2HI */
+VECTOR_MODE (INT, SI, 2);      /*		  V2SI */
+VECTOR_MODE (INT, DI, 2);      /*		  V2DI */
+VECTOR_MODE (INT, TI, 2);      /*		  V2TI */
+VECTOR_MODE (FLOAT, HF, 2);    /*		  V2HF */
+VECTOR_MODE (FLOAT, SF, 2);    /*		  V2SF */
+VECTOR_MODE (FLOAT, DF, 2);    /*		  V2DF */
+
 /* Vector units handle reads independently and thus no large alignment
    needed.  */
 ADJUST_ALIGNMENT (V64QI, 1);
@@ -39,3 +81,43 @@  ADJUST_ALIGNMENT (V64TI, 16);
 ADJUST_ALIGNMENT (V64HF, 2);
 ADJUST_ALIGNMENT (V64SF, 4);
 ADJUST_ALIGNMENT (V64DF, 8);
+ADJUST_ALIGNMENT (V32QI, 1);
+ADJUST_ALIGNMENT (V32HI, 2);
+ADJUST_ALIGNMENT (V32SI, 4);
+ADJUST_ALIGNMENT (V32DI, 8);
+ADJUST_ALIGNMENT (V32TI, 16);
+ADJUST_ALIGNMENT (V32HF, 2);
+ADJUST_ALIGNMENT (V32SF, 4);
+ADJUST_ALIGNMENT (V32DF, 8);
+ADJUST_ALIGNMENT (V16QI, 1);
+ADJUST_ALIGNMENT (V16HI, 2);
+ADJUST_ALIGNMENT (V16SI, 4);
+ADJUST_ALIGNMENT (V16DI, 8);
+ADJUST_ALIGNMENT (V16TI, 16);
+ADJUST_ALIGNMENT (V16HF, 2);
+ADJUST_ALIGNMENT (V16SF, 4);
+ADJUST_ALIGNMENT (V16DF, 8);
+ADJUST_ALIGNMENT (V8QI, 1);
+ADJUST_ALIGNMENT (V8HI, 2);
+ADJUST_ALIGNMENT (V8SI, 4);
+ADJUST_ALIGNMENT (V8DI, 8);
+ADJUST_ALIGNMENT (V8TI, 16);
+ADJUST_ALIGNMENT (V8HF, 2);
+ADJUST_ALIGNMENT (V8SF, 4);
+ADJUST_ALIGNMENT (V8DF, 8);
+ADJUST_ALIGNMENT (V4QI, 1);
+ADJUST_ALIGNMENT (V4HI, 2);
+ADJUST_ALIGNMENT (V4SI, 4);
+ADJUST_ALIGNMENT (V4DI, 8);
+ADJUST_ALIGNMENT (V4TI, 16);
+ADJUST_ALIGNMENT (V4HF, 2);
+ADJUST_ALIGNMENT (V4SF, 4);
+ADJUST_ALIGNMENT (V4DF, 8);
+ADJUST_ALIGNMENT (V2QI, 1);
+ADJUST_ALIGNMENT (V2HI, 2);
+ADJUST_ALIGNMENT (V2SI, 4);
+ADJUST_ALIGNMENT (V2DI, 8);
+ADJUST_ALIGNMENT (V2TI, 16);
+ADJUST_ALIGNMENT (V2HF, 2);
+ADJUST_ALIGNMENT (V2SF, 4);
+ADJUST_ALIGNMENT (V2DF, 8);
diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index ca804609c09..6300c1cbd36 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -34,8 +34,6 @@  extern rtx gcn_expand_scalar_to_vector_address (machine_mode, rtx, rtx, rtx);
 extern void gcn_expand_vector_init (rtx, rtx);
 extern bool gcn_flat_address_p (rtx, machine_mode);
 extern bool gcn_fp_constant_p (rtx, bool);
-extern rtx gcn_full_exec ();
-extern rtx gcn_full_exec_reg ();
 extern rtx gcn_gen_undef (machine_mode);
 extern bool gcn_global_address_p (rtx);
 extern tree gcn_goacc_adjust_private_decl (location_t, tree var, int level);
@@ -67,8 +65,6 @@  extern rtx gcn_operand_part (machine_mode, rtx, int);
 extern bool gcn_regno_mode_code_ok_for_base_p (int, machine_mode,
 					       addr_space_t, int, int);
 extern reg_class gcn_regno_reg_class (int regno);
-extern rtx gcn_scalar_exec ();
-extern rtx gcn_scalar_exec_reg ();
 extern bool gcn_scalar_flat_address_p (rtx);
 extern bool gcn_scalar_flat_mem_p (rtx);
 extern bool gcn_sgpr_move_p (rtx, rtx);
@@ -105,9 +101,11 @@  extern gimple_opt_pass *make_pass_omp_gcn (gcc::context *ctxt);
 inline bool
 vgpr_1reg_mode_p (machine_mode mode)
 {
-  return (mode == SImode || mode == SFmode || mode == HImode || mode == QImode
-	  || mode == V64QImode || mode == V64HImode || mode == V64SImode
-	  || mode == V64HFmode || mode == V64SFmode || mode == BImode);
+  if (VECTOR_MODE_P (mode))
+    mode = GET_MODE_INNER (mode);
+
+  return (mode == SImode || mode == SFmode || mode == HImode || mode == HFmode
+	  || mode == QImode || mode == BImode);
 }
 
 /* Return true if MODE is valid for 1 SGPR register.  */
@@ -124,8 +122,10 @@  sgpr_1reg_mode_p (machine_mode mode)
 inline bool
 vgpr_2reg_mode_p (machine_mode mode)
 {
-  return (mode == DImode || mode == DFmode
-	  || mode == V64DImode || mode == V64DFmode);
+  if (VECTOR_MODE_P (mode))
+    mode = GET_MODE_INNER (mode);
+
+  return (mode == DImode || mode == DFmode);
 }
 
 /* Return true if MODE can be handled directly by VGPR operations.  */
@@ -133,9 +133,7 @@  vgpr_2reg_mode_p (machine_mode mode)
 inline bool
 vgpr_vector_mode_p (machine_mode mode)
 {
-  return (mode == V64QImode || mode == V64HImode
-	  || mode == V64SImode || mode == V64DImode
-	  || mode == V64HFmode || mode == V64SFmode || mode == V64DFmode);
+  return VECTOR_MODE_P (mode);
 }
 
 
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index dec81e863f7..52d2fcb880a 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -17,88 +17,243 @@ 
 ;; {{{ Vector iterators
 
 ; Vector modes for specific types
-; (This will make more sense when there are multiple vector sizes)
 (define_mode_iterator V_QI
-		      [V64QI])
+		      [V2QI V4QI V8QI V16QI V32QI V64QI])
 (define_mode_iterator V_HI
-		      [V64HI])
+		      [V2HI V4HI V8HI V16HI V32HI V64HI])
 (define_mode_iterator V_HF
-		      [V64HF])
+		      [V2HF V4HF V8HF V16HF V32HF V64HF])
 (define_mode_iterator V_SI
-		      [V64SI])
+		      [V2SI V4SI V8SI V16SI V32SI V64SI])
 (define_mode_iterator V_SF
-		      [V64SF])
+		      [V2SF V4SF V8SF V16SF V32SF V64SF])
 (define_mode_iterator V_DI
-		      [V64DI])
+		      [V2DI V4DI V8DI V16DI V32DI V64DI])
 (define_mode_iterator V_DF
-		      [V64DF])
+		      [V2DF V4DF V8DF V16DF V32DF V64DF])
+
+(define_mode_iterator V64_SI
+		      [V64SI])
+(define_mode_iterator V64_DI
+		      [V64DI])
 
 ; Vector modes for sub-dword modes
 (define_mode_iterator V_QIHI
-		      [V64QI V64HI])
+		      [V2QI V2HI
+		       V4QI V4HI
+		       V8QI V8HI
+		       V16QI V16HI
+		       V32QI V32HI
+		       V64QI V64HI])
 
 ; Vector modes for one vector register
 (define_mode_iterator V_1REG
-		      [V64QI V64HI V64SI V64HF V64SF])
+		      [V2QI V2HI V2SI V2HF V2SF
+		       V4QI V4HI V4SI V4HF V4SF
+		       V8QI V8HI V8SI V8HF V8SF
+		       V16QI V16HI V16SI V16HF V16SF
+		       V32QI V32HI V32SI V32HF V32SF
+		       V64QI V64HI V64SI V64HF V64SF])
 
 (define_mode_iterator V_INT_1REG
-		      [V64QI V64HI V64SI])
+		      [V2QI V2HI V2SI
+		       V4QI V4HI V4SI
+		       V8QI V8HI V8SI
+		       V16QI V16HI V16SI
+		       V32QI V32HI V32SI
+		       V64QI V64HI V64SI])
 (define_mode_iterator V_INT_1REG_ALT
-		      [V64QI V64HI V64SI])
+		      [V2QI V2HI V2SI
+		       V4QI V4HI V4SI
+		       V8QI V8HI V8SI
+		       V16QI V16HI V16SI
+		       V32QI V32HI V32SI
+		       V64QI V64HI V64SI])
 (define_mode_iterator V_FP_1REG
-		      [V64HF V64SF])
+		      [V2HF V2SF
+		       V4HF V4SF
+		       V8HF V8SF
+		       V16HF V16SF
+		       V32HF V32SF
+		       V64HF V64SF])
+
+; V64_* modes are for where more general support is unimplemented
+; (e.g. reductions)
+(define_mode_iterator V64_1REG
+		      [V64QI V64HI V64SI V64HF V64SF])
+(define_mode_iterator V64_INT_1REG
+		      [V64QI V64HI V64SI])
 
 ; Vector modes for two vector registers
 (define_mode_iterator V_2REG
+		      [V2DI V2DF
+		       V4DI V4DF
+		       V8DI V8DF
+		       V16DI V16DF
+		       V32DI V32DF
+		       V64DI V64DF])
+
+(define_mode_iterator V64_2REG
 		      [V64DI V64DF])
 
 ; Vector modes with native support
 (define_mode_iterator V_noQI
-		      [V64HI V64HF V64SI V64SF V64DI V64DF])
+		      [V2HI V2HF V2SI V2SF V2DI V2DF
+		       V4HI V4HF V4SI V4SF V4DI V4DF
+		       V8HI V8HF V8SI V8SF V8DI V8DF
+		       V16HI V16HF V16SI V16SF V16DI V16DF
+		       V32HI V32HF V32SI V32SF V32DI V32DF
+		       V64HI V64HF V64SI V64SF V64DI V64DF])
 (define_mode_iterator V_noHI
-		      [V64HF V64SI V64SF V64DI V64DF])
+		      [V2HF V2SI V2SF V2DI V2DF
+		       V4HF V4SI V4SF V4DI V4DF
+		       V8HF V8SI V8SF V8DI V8DF
+		       V16HF V16SI V16SF V16DI V16DF
+		       V32HF V32SI V32SF V32DI V32DF
+		       V64HF V64SI V64SF V64DI V64DF])
 
 (define_mode_iterator V_INT_noQI
-		      [V64HI V64SI V64DI])
+		      [V2HI V2SI V2DI
+		       V4HI V4SI V4DI
+		       V8HI V8SI V8DI
+		       V16HI V16SI V16DI
+		       V32HI V32SI V32DI
+		       V64HI V64SI V64DI])
 (define_mode_iterator V_INT_noHI
-		      [V64SI V64DI])
+		      [V2SI V2DI
+		       V4SI V4DI
+		       V8SI V8DI
+		       V16SI V16DI
+		       V32SI V32DI
+		       V64SI V64DI])
 
 ; All of above
 (define_mode_iterator V_ALL
-		      [V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
+		      [V2QI V2HI V2HF V2SI V2SF V2DI V2DF
+		       V4QI V4HI V4HF V4SI V4SF V4DI V4DF
+		       V8QI V8HI V8HF V8SI V8SF V8DI V8DF
+		       V16QI V16HI V16HF V16SI V16SF V16DI V16DF
+		       V32QI V32HI V32HF V32SI V32SF V32DI V32DF
+		       V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
 (define_mode_iterator V_ALL_ALT
-		      [V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
+		      [V2QI V2HI V2HF V2SI V2SF V2DI V2DF
+		       V4QI V4HI V4HF V4SI V4SF V4DI V4DF
+		       V8QI V8HI V8HF V8SI V8SF V8DI V8DF
+		       V16QI V16HI V16HF V16SI V16SF V16DI V16DF
+		       V32QI V32HI V32HF V32SI V32SF V32DI V32DF
+		       V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
 
 (define_mode_iterator V_INT
-		      [V64QI V64HI V64SI V64DI])
+		      [V2QI V2HI V2SI V2DI
+		       V4QI V4HI V4SI V4DI
+		       V8QI V8HI V8SI V8DI
+		       V16QI V16HI V16SI V16DI
+		       V32QI V32HI V32SI V32DI
+		       V64QI V64HI V64SI V64DI])
 (define_mode_iterator V_FP
+		      [V2HF V2SF V2DF
+		       V4HF V4SF V4DF
+		       V8HF V8SF V8DF
+		       V16HF V16SF V16DF
+		       V32HF V32SF V32DF
+		       V64HF V64SF V64DF])
+
+(define_mode_iterator V64_ALL
+		      [V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
+(define_mode_iterator V64_FP
 		      [V64HF V64SF V64DF])
 
 (define_mode_attr scalar_mode
-  [(V64QI "qi") (V64HI "hi") (V64SI "si")
+  [(V2QI "qi") (V2HI "hi") (V2SI "si")
+   (V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
+   (V4QI "qi") (V4HI "hi") (V4SI "si")
+   (V4HF "hf") (V4SF "sf") (V4DI "di") (V4DF "df")
+   (V8QI "qi") (V8HI "hi") (V8SI "si")
+   (V8HF "hf") (V8SF "sf") (V8DI "di") (V8DF "df")
+   (V16QI "qi") (V16HI "hi") (V16SI "si")
+   (V16HF "hf") (V16SF "sf") (V16DI "di") (V16DF "df")
+   (V32QI "qi") (V32HI "hi") (V32SI "si")
+   (V32HF "hf") (V32SF "sf") (V32DI "di") (V32DF "df")
+   (V64QI "qi") (V64HI "hi") (V64SI "si")
    (V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
 
 (define_mode_attr SCALAR_MODE
-  [(V64QI "QI") (V64HI "HI") (V64SI "SI")
+  [(V2QI "QI") (V2HI "HI") (V2SI "SI")
+   (V2HF "HF") (V2SF "SF") (V2DI "DI") (V2DF "DF")
+   (V4QI "QI") (V4HI "HI") (V4SI "SI")
+   (V4HF "HF") (V4SF "SF") (V4DI "DI") (V4DF "DF")
+   (V8QI "QI") (V8HI "HI") (V8SI "SI")
+   (V8HF "HF") (V8SF "SF") (V8DI "DI") (V8DF "DF")
+   (V16QI "QI") (V16HI "HI") (V16SI "SI")
+   (V16HF "HF") (V16SF "SF") (V16DI "DI") (V16DF "DF")
+   (V32QI "QI") (V32HI "HI") (V32SI "SI")
+   (V32HF "HF") (V32SF "SF") (V32DI "DI") (V32DF "DF")
+   (V64QI "QI") (V64HI "HI") (V64SI "SI")
    (V64HF "HF") (V64SF "SF") (V64DI "DI") (V64DF "DF")])
 
 (define_mode_attr vnsi
-  [(V64QI "v64si") (V64HI "v64si") (V64HF "v64si") (V64SI "v64si")
+  [(V2QI "v2si") (V2HI "v2si") (V2HF "v2si") (V2SI "v2si")
+   (V2SF "v2si") (V2DI "v2si") (V2DF "v2si")
+   (V4QI "v4si") (V4HI "v4si") (V4HF "v4si") (V4SI "v4si")
+   (V4SF "v4si") (V4DI "v4si") (V4DF "v4si")
+   (V8QI "v8si") (V8HI "v8si") (V8HF "v8si") (V8SI "v8si")
+   (V8SF "v8si") (V8DI "v8si") (V8DF "v8si")
+   (V16QI "v16si") (V16HI "v16si") (V16HF "v16si") (V16SI "v16si")
+   (V16SF "v16si") (V16DI "v16si") (V16DF "v16si")
+   (V32QI "v32si") (V32HI "v32si") (V32HF "v32si") (V32SI "v32si")
+   (V32SF "v32si") (V32DI "v32si") (V32DF "v32si")
+   (V64QI "v64si") (V64HI "v64si") (V64HF "v64si") (V64SI "v64si")
    (V64SF "v64si") (V64DI "v64si") (V64DF "v64si")])
 
 (define_mode_attr VnSI
-  [(V64QI "V64SI") (V64HI "V64SI") (V64HF "V64SI") (V64SI "V64SI")
+  [(V2QI "V2SI") (V2HI "V2SI") (V2HF "V2SI") (V2SI "V2SI")
+   (V2SF "V2SI") (V2DI "V2SI") (V2DF "V2SI")
+   (V4QI "V4SI") (V4HI "V4SI") (V4HF "V4SI") (V4SI "V4SI")
+   (V4SF "V4SI") (V4DI "V4SI") (V4DF "V4SI")
+   (V8QI "V8SI") (V8HI "V8SI") (V8HF "V8SI") (V8SI "V8SI")
+   (V8SF "V8SI") (V8DI "V8SI") (V8DF "V8SI")
+   (V16QI "V16SI") (V16HI "V16SI") (V16HF "V16SI") (V16SI "V16SI")
+   (V16SF "V16SI") (V16DI "V16SI") (V16DF "V16SI")
+   (V32QI "V32SI") (V32HI "V32SI") (V32HF "V32SI") (V32SI "V32SI")
+   (V32SF "V32SI") (V32DI "V32SI") (V32DF "V32SI")
+   (V64QI "V64SI") (V64HI "V64SI") (V64HF "V64SI") (V64SI "V64SI")
    (V64SF "V64SI") (V64DI "V64SI") (V64DF "V64SI")])
 
 (define_mode_attr vndi
-  [(V64QI "v64di") (V64HI "v64di") (V64HF "v64di") (V64SI "v64di")
+  [(V2QI "v2di") (V2HI "v2di") (V2HF "v2di") (V2SI "v2di")
+   (V2SF "v2di") (V2DI "v2di") (V2DF "v2di")
+   (V4QI "v4di") (V4HI "v4di") (V4HF "v4di") (V4SI "v4di")
+   (V4SF "v4di") (V4DI "v4di") (V4DF "v4di")
+   (V8QI "v8di") (V8HI "v8di") (V8HF "v8di") (V8SI "v8di")
+   (V8SF "v8di") (V8DI "v8di") (V8DF "v8di")
+   (V16QI "v16di") (V16HI "v16di") (V16HF "v16di") (V16SI "v16di")
+   (V16SF "v16di") (V16DI "v16di") (V16DF "v16di")
+   (V32QI "v32di") (V32HI "v32di") (V32HF "v32di") (V32SI "v32di")
+   (V32SF "v32di") (V32DI "v32di") (V32DF "v32di")
+   (V64QI "v64di") (V64HI "v64di") (V64HF "v64di") (V64SI "v64di")
    (V64SF "v64di") (V64DI "v64di") (V64DF "v64di")])
 
 (define_mode_attr VnDI
-  [(V64QI "V64DI") (V64HI "V64DI") (V64HF "V64DI") (V64SI "V64DI")
+  [(V2QI "V2DI") (V2HI "V2DI") (V2HF "V2DI") (V2SI "V2DI")
+   (V2SF "V2DI") (V2DI "V2DI") (V2DF "V2DI")
+   (V4QI "V4DI") (V4HI "V4DI") (V4HF "V4DI") (V4SI "V4DI")
+   (V4SF "V4DI") (V4DI "V4DI") (V4DF "V4DI")
+   (V8QI "V8DI") (V8HI "V8DI") (V8HF "V8DI") (V8SI "V8DI")
+   (V8SF "V8DI") (V8DI "V8DI") (V8DF "V8DI")
+   (V16QI "V16DI") (V16HI "V16DI") (V16HF "V16DI") (V16SI "V16DI")
+   (V16SF "V16DI") (V16DI "V16DI") (V16DF "V16DI")
+   (V32QI "V32DI") (V32HI "V32DI") (V32HF "V32DI") (V32SI "V32DI")
+   (V32SF "V32DI") (V32DI "V32DI") (V32DF "V32DI")
+   (V64QI "V64DI") (V64HI "V64DI") (V64HF "V64DI") (V64SI "V64DI")
    (V64SF "V64DI") (V64DI "V64DI") (V64DF "V64DI")])
 
-(define_mode_attr sdwa [(V64QI "BYTE_0") (V64HI "WORD_0") (V64SI "DWORD")])
+(define_mode_attr sdwa
+  [(V2QI "BYTE_0") (V2HI "WORD_0") (V2SI "DWORD")
+   (V4QI "BYTE_0") (V4HI "WORD_0") (V4SI "DWORD")
+   (V8QI "BYTE_0") (V8HI "WORD_0") (V8SI "DWORD")
+   (V16QI "BYTE_0") (V16HI "WORD_0") (V16SI "DWORD")
+   (V32QI "BYTE_0") (V32HI "WORD_0") (V32SI "DWORD")
+   (V64QI "BYTE_0") (V64HI "WORD_0") (V64SI "DWORD")])
 
 ;; }}}
 ;; {{{ Substitutions
@@ -180,6 +335,37 @@  (define_expand "mov<mode>"
 	(match_operand:V_ALL 1 "general_operand"))]
   ""
   {
+    /* Bitwise reinterpret casts via SUBREG don't work with GCN vector
+       registers, but we can convert the MEM to a mode that does work.  */
+    if (MEM_P (operands[0]) && !SUBREG_P (operands[0])
+	&& SUBREG_P (operands[1])
+	&& GET_MODE_SIZE (GET_MODE (operands[1]))
+	   == GET_MODE_SIZE (GET_MODE (SUBREG_REG (operands[1]))))
+      {
+        rtx src = SUBREG_REG (operands[1]);
+        rtx mem = copy_rtx (operands[0]);
+	PUT_MODE_RAW (mem, GET_MODE (src));
+	emit_move_insn (mem, src);
+	DONE;
+      }
+    if (MEM_P (operands[1]) && !SUBREG_P (operands[1])
+	&& SUBREG_P (operands[0])
+	&& GET_MODE_SIZE (GET_MODE (operands[0]))
+	   == GET_MODE_SIZE (GET_MODE (SUBREG_REG (operands[0]))))
+      {
+        rtx dest = SUBREG_REG (operands[0]);
+        rtx mem = copy_rtx (operands[1]);
+	PUT_MODE_RAW (mem, GET_MODE (dest));
+	emit_move_insn (dest, mem);
+	DONE;
+      }
+
+    /* SUBREG of MEM is not supported.  */
+    gcc_assert ((!SUBREG_P (operands[0])
+		 || !MEM_P (SUBREG_REG (operands[0])))
+		&& (!SUBREG_P (operands[1])
+		    || !MEM_P (SUBREG_REG (operands[1]))));
+
     if (MEM_P (operands[0]) && !lra_in_progress && !reload_completed)
       {
 	operands[1] = force_reg (<MODE>mode, operands[1]);
@@ -2419,10 +2605,10 @@  (define_insn "ldexp<mode>3"
    (set_attr "length" "8")])
 
 (define_insn "ldexp<mode>3<exec>"
-  [(set (match_operand:V_FP 0 "register_operand"  "=v")
+  [(set (match_operand:V_FP 0 "register_operand"     "=  v")
 	(unspec:V_FP
-	  [(match_operand:V_FP 1 "gcn_alu_operand" "vB")
-	   (match_operand:V64SI 2 "gcn_alu_operand" "vSvA")]
+	  [(match_operand:V_FP 1 "gcn_alu_operand"   "  vB")
+	   (match_operand:<VnSI> 2 "gcn_alu_operand" "vSvA")]
 	  UNSPEC_LDEXP))]
   ""
   "v_ldexp%i0\t%0, %1, %2"
@@ -2452,8 +2638,8 @@  (define_insn "frexp<mode>_mant2"
    (set_attr "length" "8")])
 
 (define_insn "frexp<mode>_exp2<exec>"
-  [(set (match_operand:V64SI 0 "register_operand" "=v")
-	(unspec:V64SI
+  [(set (match_operand:<VnSI> 0 "register_operand" "=v")
+	(unspec:<VnSI>
 	  [(match_operand:V_FP 1 "gcn_alu_operand" "vB")]
 	  UNSPEC_FREXP_EXP))]
   ""
@@ -2640,9 +2826,27 @@  (define_expand "div<mode>3"
 (define_mode_iterator CVT_FROM_MODE [HI SI HF SF DF])
 (define_mode_iterator CVT_TO_MODE [HI SI HF SF DF])
 
-(define_mode_iterator VCVT_MODE [V64HI V64SI V64HF V64SF V64DF])
-(define_mode_iterator VCVT_FMODE [V64HF V64SF V64DF])
-(define_mode_iterator VCVT_IMODE [V64HI V64SI])
+(define_mode_iterator VCVT_MODE
+		      [V2HI V2SI V2HF V2SF V2DF
+		       V4HI V4SI V4HF V4SF V4DF
+		       V8HI V8SI V8HF V8SF V8DF
+		       V16HI V16SI V16HF V16SF V16DF
+		       V32HI V32SI V32HF V32SF V32DF
+		       V64HI V64SI V64HF V64SF V64DF])
+(define_mode_iterator VCVT_FMODE
+		      [V2HF V2SF V2DF
+		       V4HF V4SF V4DF
+		       V8HF V8SF V8DF
+		       V16HF V16SF V16DF
+		       V32HF V32SF V32DF
+		       V64HF V64SF V64DF])
+(define_mode_iterator VCVT_IMODE
+		      [V2HI V2SI
+		       V4HI V4SI
+		       V8HI V8SI
+		       V16HI V16SI
+		       V32HI V32SI
+		       V64HI V64SI])
 
 (define_code_iterator cvt_op [fix unsigned_fix
 			      float unsigned_float
@@ -3265,7 +3469,7 @@  (define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0")
 (define_expand "reduc_<reduc_op>_scal_<mode>"
   [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
 	(unspec:<SCALAR_MODE>
-	  [(match_operand:V_ALL 1 "register_operand")]
+	  [(match_operand:V64_ALL 1 "register_operand")]
 	  REDUC_UNSPEC))]
   ""
   {
@@ -3284,7 +3488,7 @@  (define_expand "reduc_<reduc_op>_scal_<mode>"
 (define_expand "fold_left_plus_<mode>"
  [(match_operand:<SCALAR_MODE> 0 "register_operand")
   (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand")
-  (match_operand:V_FP 2 "gcn_alu_operand")]
+  (match_operand:V64_FP 2 "gcn_alu_operand")]
   "can_create_pseudo_p ()
    && (flag_openacc || flag_openmp
        || flag_associative_math)"
@@ -3300,11 +3504,11 @@  (define_expand "fold_left_plus_<mode>"
    })
 
 (define_insn "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:V_1REG 0 "register_operand"   "=v")
-	(unspec:V_1REG
-	  [(match_operand:V_1REG 1 "register_operand" "v")
-	   (match_operand:V_1REG 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"    "n")]
+  [(set (match_operand:V64_1REG 0 "register_operand"   "=v")
+	(unspec:V64_1REG
+	  [(match_operand:V64_1REG 1 "register_operand" "v")
+	   (match_operand:V64_1REG 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"      "n")]
 	  REDUC_UNSPEC))]
   ; GCN3 requires a carry out, GCN5 not
   "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
@@ -3317,11 +3521,11 @@  (define_insn "*<reduc_op>_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:V_DI 0 "register_operand"    "=v")
-	(unspec:V_DI
-	  [(match_operand:V_DI 1 "register_operand" "v")
-	   (match_operand:V_DI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"  "n")]
+  [(set (match_operand:V64_DI 0 "register_operand"    "=v")
+	(unspec:V64_DI
+	  [(match_operand:V64_DI 1 "register_operand" "v")
+	   (match_operand:V64_DI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"    "n")]
 	  REDUC_2REG_UNSPEC))]
   ""
   "#"
@@ -3346,10 +3550,10 @@  (define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
 ; Special cases for addition.
 
 (define_insn "*plus_carry_dpp_shr_<mode>"
-  [(set (match_operand:V_INT_1REG 0 "register_operand"   "=v")
-	(unspec:V_INT_1REG
-	  [(match_operand:V_INT_1REG 1 "register_operand" "v")
-	   (match_operand:V_INT_1REG 2 "register_operand" "v")
+  [(set (match_operand:V64_INT_1REG 0 "register_operand"   "=v")
+	(unspec:V64_INT_1REG
+	  [(match_operand:V64_INT_1REG 1 "register_operand" "v")
+	   (match_operand:V64_INT_1REG 2 "register_operand" "v")
 	   (match_operand:SI 3 "const_int_operand"	  "n")]
 	  UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
@@ -3363,12 +3567,12 @@  (define_insn "*plus_carry_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn "*plus_carry_in_dpp_shr_<mode>"
-  [(set (match_operand:V_SI 0 "register_operand"    "=v")
-	(unspec:V_SI
-	  [(match_operand:V_SI 1 "register_operand" "v")
-	   (match_operand:V_SI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"  "n")
-	   (match_operand:DI 4 "register_operand"   "cV")]
+  [(set (match_operand:V64_SI 0 "register_operand"    "=v")
+	(unspec:V64_SI
+	  [(match_operand:V64_SI 1 "register_operand" "v")
+	   (match_operand:V64_SI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"    "n")
+	   (match_operand:DI 4 "register_operand"     "cV")]
 	  UNSPEC_PLUS_CARRY_IN_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
@@ -3381,11 +3585,11 @@  (define_insn "*plus_carry_in_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn_and_split "*plus_carry_dpp_shr_<mode>"
-  [(set (match_operand:V_DI 0 "register_operand"    "=v")
-	(unspec:V_DI
-	  [(match_operand:V_DI 1 "register_operand" "v")
-	   (match_operand:V_DI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"  "n")]
+  [(set (match_operand:V64_DI 0 "register_operand"    "=v")
+	(unspec:V64_DI
+	  [(match_operand:V64_DI 1 "register_operand" "v")
+	   (match_operand:V64_DI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"    "n")]
 	  UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
@@ -3416,7 +3620,7 @@  (define_insn_and_split "*plus_carry_dpp_shr_<mode>"
 (define_insn "mov_from_lane63_<mode>"
   [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
 	(unspec:<SCALAR_MODE>
-	  [(match_operand:V_1REG 1 "register_operand"	  "  v,v")]
+	  [(match_operand:V64_1REG 1 "register_operand"	  "  v,v")]
 	  UNSPEC_MOV_FROM_LANE63))]
   ""
   "@
@@ -3429,7 +3633,7 @@  (define_insn "mov_from_lane63_<mode>"
 (define_insn "mov_from_lane63_<mode>"
   [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
 	(unspec:<SCALAR_MODE>
-	  [(match_operand:V_2REG 1 "register_operand"	  "  v,v")]
+	  [(match_operand:V64_2REG 1 "register_operand"	  "  v,v")]
 	  UNSPEC_MOV_FROM_LANE63))]
   ""
   "@
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index c27ee91210e..e1636f6ddd6 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -395,6 +395,97 @@  gcn_scalar_mode_supported_p (scalar_mode mode)
 	  || mode == TImode);
 }
 
+/* Return a vector mode with N lanes of MODE.  */
+
+static machine_mode
+VnMODE (int n, machine_mode mode)
+{
+  switch (mode)
+    {
+    case QImode:
+      switch (n)
+	{
+	case 2: return V2QImode;
+	case 4: return V4QImode;
+	case 8: return V8QImode;
+	case 16: return V16QImode;
+	case 32: return V32QImode;
+	case 64: return V64QImode;
+	}
+      break;
+    case HImode:
+      switch (n)
+	{
+	case 2: return V2HImode;
+	case 4: return V4HImode;
+	case 8: return V8HImode;
+	case 16: return V16HImode;
+	case 32: return V32HImode;
+	case 64: return V64HImode;
+	}
+      break;
+    case HFmode:
+      switch (n)
+	{
+	case 2: return V2HFmode;
+	case 4: return V4HFmode;
+	case 8: return V8HFmode;
+	case 16: return V16HFmode;
+	case 32: return V32HFmode;
+	case 64: return V64HFmode;
+	}
+      break;
+    case SImode:
+      switch (n)
+	{
+	case 2: return V2SImode;
+	case 4: return V4SImode;
+	case 8: return V8SImode;
+	case 16: return V16SImode;
+	case 32: return V32SImode;
+	case 64: return V64SImode;
+	}
+      break;
+    case SFmode:
+      switch (n)
+	{
+	case 2: return V2SFmode;
+	case 4: return V4SFmode;
+	case 8: return V8SFmode;
+	case 16: return V16SFmode;
+	case 32: return V32SFmode;
+	case 64: return V64SFmode;
+	}
+      break;
+    case DImode:
+      switch (n)
+	{
+	case 2: return V2DImode;
+	case 4: return V4DImode;
+	case 8: return V8DImode;
+	case 16: return V16DImode;
+	case 32: return V32DImode;
+	case 64: return V64DImode;
+	}
+      break;
+    case DFmode:
+      switch (n)
+	{
+	case 2: return V2DFmode;
+	case 4: return V4DFmode;
+	case 8: return V8DFmode;
+	case 16: return V16DFmode;
+	case 32: return V32DFmode;
+	case 64: return V64DFmode;
+	}
+      break;
+    default:
+      break;
+    }
+
+  return VOIDmode;
+}
+
 /* Implement TARGET_CLASS_MAX_NREGS.
  
    Return the number of hard registers needed to hold a value of MODE in
@@ -556,6 +647,23 @@  gcn_can_change_mode_class (machine_mode from, machine_mode to,
 {
   if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
     return true;
+
+  /* Vector conversions are only valid when changing mode with a fixed number
+     of lanes, or changing number of lanes with a fixed mode.  Anything else
+     would require actual data movement.  */
+  if (VECTOR_MODE_P (from) && VECTOR_MODE_P (to)
+      && GET_MODE_NUNITS (from) != GET_MODE_NUNITS (to)
+      && GET_MODE_INNER (from) != GET_MODE_INNER (to))
+    return false;
+
+  /* Vector/scalar conversions are only permitted when the scalar mode
+     is the same or smaller than the inner vector mode.  */
+  if ((VECTOR_MODE_P (from) && !VECTOR_MODE_P (to)
+       && GET_MODE_SIZE (to) >= GET_MODE_SIZE (GET_MODE_INNER (from)))
+      || (VECTOR_MODE_P (to) && !VECTOR_MODE_P (from)
+	  && GET_MODE_SIZE (from) >= GET_MODE_SIZE (GET_MODE_INNER (to))))
+    return false;
+
   return (gcn_class_max_nregs (regclass, from)
 	  == gcn_class_max_nregs (regclass, to));
 }
@@ -595,6 +703,16 @@  gcn_class_likely_spilled_p (reg_class_t rclass)
 bool
 gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
 {
+  if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
+    {
+      int vf1 = (VECTOR_MODE_P (mode1) ? GET_MODE_NUNITS (mode1) : 1);
+      int vf2 = (VECTOR_MODE_P (mode2) ? GET_MODE_NUNITS (mode2) : 1);
+      machine_mode inner1 = (vf1 > 1 ? GET_MODE_INNER (mode1) : mode1);
+      machine_mode inner2 = (vf2 > 1 ? GET_MODE_INNER (mode2) : mode2);
+
+      return (vf1 == vf2 || (inner1 == inner2 && vf2 <= vf1));
+    }
+
   return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
 	  && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
 }
@@ -616,14 +734,16 @@  gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
 rtx
 gcn_operand_part (machine_mode mode, rtx op, int n)
 {
-  if (GET_MODE_SIZE (mode) >= 256)
+  int vf = VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1;
+
+  if (vf > 1)
     {
-      /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0);  */
+      machine_mode vsimode = VnMODE (vf, SImode);
 
       if (REG_P (op))
 	{
 	  gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
-	  return gen_rtx_REG (V64SImode, REGNO (op) + n);
+	  return gen_rtx_REG (vsimode, REGNO (op) + n);
 	}
       if (GET_CODE (op) == CONST_VECTOR)
 	{
@@ -634,10 +754,10 @@  gcn_operand_part (machine_mode mode, rtx op, int n)
 	    RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
 						 CONST_VECTOR_ELT (op, i), n);
 
-	  return gen_rtx_CONST_VECTOR (V64SImode, v);
+	  return gen_rtx_CONST_VECTOR (vsimode, v);
 	}
       if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
-	return gcn_gen_undef (V64SImode);
+	return gcn_gen_undef (vsimode);
       gcc_unreachable ();
     }
   else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
@@ -734,38 +854,6 @@  get_exec (int64_t val)
   return reg;
 }
 
-/* Return value of scalar exec register.  */
-
-rtx
-gcn_scalar_exec ()
-{
-  return const1_rtx;
-}
-
-/* Return pseudo holding scalar exec register.  */
-
-rtx
-gcn_scalar_exec_reg ()
-{
-  return get_exec (1);
-}
-
-/* Return value of full exec register.  */
-
-rtx
-gcn_full_exec ()
-{
-  return constm1_rtx;
-}
-
-/* Return pseudo holding full exec register.  */
-
-rtx
-gcn_full_exec_reg ()
-{
-  return get_exec (-1);
-}
-
 /* }}}  */
 /* {{{ Immediate constants.  */
 
@@ -802,8 +890,13 @@  int
 gcn_inline_fp_constant_p (rtx x, bool allow_vector)
 {
   machine_mode mode = GET_MODE (x);
+  int vf = VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1;
 
-  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
+  if (vf > 1)
+    mode = GET_MODE_INNER (mode);
+
+  if (vf > 1
+      && (mode == HFmode || mode == SFmode || mode == DFmode)
       && allow_vector)
     {
       int n;
@@ -812,7 +905,7 @@  gcn_inline_fp_constant_p (rtx x, bool allow_vector)
       n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
       if (!n)
 	return 0;
-      for (int i = 1; i < 64; i++)
+      for (int i = 1; i < vf; i++)
 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
 	  return 0;
       return 1;
@@ -867,8 +960,13 @@  bool
 gcn_fp_constant_p (rtx x, bool allow_vector)
 {
   machine_mode mode = GET_MODE (x);
+  int vf = VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1;
 
-  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
+  if (vf > 1)
+    mode = GET_MODE_INNER (mode);
+
+  if (vf > 1
+      && (mode == HFmode || mode == SFmode || mode == DFmode)
       && allow_vector)
     {
       int n;
@@ -877,7 +975,7 @@  gcn_fp_constant_p (rtx x, bool allow_vector)
       n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
       if (!n)
 	return false;
-      for (int i = 1; i < 64; i++)
+      for (int i = 1; i < vf; i++)
 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
 	  return false;
       return true;
@@ -1090,6 +1188,244 @@  gcn_gen_undef (machine_mode mode)
   return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
 }
 
+/* }}}  */
+/* {{{ Utility functions.  */
+
+/*  Generalised accessor functions for instruction patterns.
+    The machine desription '@' prefix does something similar, but as of
+    GCC 10 is incompatible with define_subst, and anyway it doesn't
+    auto-handle the exec feature.
+
+    Four macros are provided; each function only needs one:
+
+    GEN_VN         - create accessor functions for all sizes of one mode
+    GEN_VNM        - create accessor functions for all sizes of all modes
+    GEN_VN_NOEXEC  - for insns without "_exec" variants
+    GEN_VNM_NOEXEC - likewise
+ 
+    E.g.  add<mode>3
+      GEN_VNM (add, 3, A(rtx dest, rtx s1, rtx s2), A(dest, s1, s2)
+
+      gen_addvNsi3 (dst, a, b)
+        -> calls gen_addv64si3, or gen_addv32si3, etc.
+
+      gen_addvNm3 (dst, a, b)
+        -> calls gen_addv64qi3, or gen_addv2di3, etc.
+
+    The mode is determined from the first parameter, which must be called
+    "dest" (or else the macro doesn't work).
+
+    Each function has two optional parameters at the end: merge_src and exec.
+    If exec is non-null, the function will call the "_exec" variant of the
+    insn.  If exec is non-null but merge_src is null then an undef unspec
+    will be created.
+
+    E.g. cont.
+      gen_addvNsi3 (v64sidst, a, b, oldval, exec)
+        -> calls gen_addv64si3_exec (v64sidst, a, b, oldval, exec)
+
+      gen_addvNm3 (v2qidst, a, b, NULL, exec)
+        -> calls gen_addv2qi3_exec (v2qidst, a, b,
+                                    gcn_gen_undef (V2QImode), exec)
+   */
+
+#define A(...) __VA_ARGS__
+#define GEN_VN_NOEXEC(PREFIX, SUFFIX, PARAMS, ARGS) \
+static rtx \
+gen_##PREFIX##vN##SUFFIX (PARAMS) \
+{ \
+  machine_mode mode = GET_MODE (dest); \
+  int n = GET_MODE_NUNITS (mode); \
+  \
+  switch (n) \
+    { \
+    case 2: return gen_##PREFIX##v2##SUFFIX (ARGS); \
+    case 4: return gen_##PREFIX##v4##SUFFIX (ARGS); \
+    case 8: return gen_##PREFIX##v8##SUFFIX (ARGS); \
+    case 16: return gen_##PREFIX##v16##SUFFIX (ARGS); \
+    case 32: return gen_##PREFIX##v32##SUFFIX (ARGS); \
+    case 64: return gen_##PREFIX##v64##SUFFIX (ARGS); \
+    } \
+  \
+  gcc_unreachable (); \
+  return NULL_RTX; \
+}
+
+#define GEN_VNM_NOEXEC(PREFIX, SUFFIX, PARAMS, ARGS) \
+GEN_VN_NOEXEC (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN_NOEXEC (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN_NOEXEC (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN_NOEXEC (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN_NOEXEC (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN_NOEXEC (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN_NOEXEC (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
+static rtx \
+gen_##PREFIX##vNm##SUFFIX (PARAMS) \
+{ \
+  machine_mode mode = GET_MODE_INNER (GET_MODE (dest)); \
+  \
+  switch (mode) \
+    { \
+    case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS); \
+    case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS); \
+    case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS); \
+    case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS); \
+    case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS); \
+    case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS); \
+    case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS); \
+    default: \
+      break; \
+    } \
+  \
+  gcc_unreachable (); \
+  return NULL_RTX; \
+}
+
+#define GEN_VN(PREFIX, SUFFIX, PARAMS, ARGS) \
+static rtx \
+gen_##PREFIX##vN##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
+{ \
+  machine_mode mode = GET_MODE (dest); \
+  int n = GET_MODE_NUNITS (mode); \
+  \
+  if (exec && !merge_src) \
+	merge_src = gcn_gen_undef (mode); \
+      \
+  if (exec) \
+    switch (n) \
+      { \
+      case 2: return gen_##PREFIX##v2##SUFFIX##_exec (ARGS, merge_src, exec); \
+      case 4: return gen_##PREFIX##v4##SUFFIX##_exec (ARGS, merge_src, exec); \
+      case 8: return gen_##PREFIX##v8##SUFFIX##_exec (ARGS, merge_src, exec); \
+      case 16: return gen_##PREFIX##v16##SUFFIX##_exec (ARGS, merge_src, exec); \
+      case 32: return gen_##PREFIX##v32##SUFFIX##_exec (ARGS, merge_src, exec); \
+      case 64: return gen_##PREFIX##v64##SUFFIX##_exec (ARGS, merge_src, exec); \
+      } \
+  else \
+    switch (n) \
+      { \
+      case 2: return gen_##PREFIX##v2##SUFFIX (ARGS); \
+      case 4: return gen_##PREFIX##v4##SUFFIX (ARGS); \
+      case 8: return gen_##PREFIX##v8##SUFFIX (ARGS); \
+      case 16: return gen_##PREFIX##v16##SUFFIX (ARGS); \
+      case 32: return gen_##PREFIX##v32##SUFFIX (ARGS); \
+      case 64: return gen_##PREFIX##v64##SUFFIX (ARGS); \
+      } \
+  \
+  gcc_unreachable (); \
+  return NULL_RTX; \
+}
+
+#define GEN_VNM(PREFIX, SUFFIX, PARAMS, ARGS) \
+GEN_VN (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
+GEN_VN (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
+static rtx \
+gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
+{ \
+  machine_mode mode = GET_MODE_INNER (GET_MODE (dest)); \
+  \
+  switch (mode) \
+    { \
+    case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS, merge_src, exec); \
+    case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS, merge_src, exec); \
+    case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS, merge_src, exec); \
+    case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS, merge_src, exec); \
+    case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS, merge_src, exec); \
+    case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS, merge_src, exec); \
+    case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS, merge_src, exec); \
+    default: \
+      break; \
+    } \
+  \
+  gcc_unreachable (); \
+  return NULL_RTX; \
+}
+
+GEN_VNM (add,3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+GEN_VN (add,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+GEN_VN (add,si3_vcc_dup, A(rtx dest, rtx src1, rtx src2, rtx vcc),
+	A(dest, src1, src2, vcc))
+GEN_VN (add,di3_sext_dup2, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+GEN_VN (add,di3_vcc_zext_dup, A(rtx dest, rtx src1, rtx src2, rtx vcc),
+	A(dest, src1, src2, vcc))
+GEN_VN (add,di3_zext_dup2, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+GEN_VN (add,di3_vcc_zext_dup2, A(rtx dest, rtx src1, rtx src2, rtx vcc),
+	A(dest, src1, src2, vcc))
+GEN_VN (addc,si3, A(rtx dest, rtx src1, rtx src2, rtx vccout, rtx vccin),
+	A(dest, src1, src2, vccout, vccin))
+GEN_VN (ashl,si3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift))
+GEN_VNM_NOEXEC (ds_bpermute,, A(rtx dest, rtx addr, rtx src, rtx exec),
+		A(dest, addr, src, exec))
+GEN_VNM (mov,, A(rtx dest, rtx src), A(dest, src))
+GEN_VN (mul,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src))
+
+#undef GEN_VNM
+#undef GEN_VN
+#undef GET_VN_FN
+#undef A
+
+/* Get icode for vector instructions without an optab.  */
+
+#define CODE_FOR(PREFIX, SUFFIX) \
+static int \
+get_code_for_##PREFIX##vN##SUFFIX (int nunits) \
+{ \
+  switch (nunits) \
+    { \
+    case 2: return CODE_FOR_##PREFIX##v2##SUFFIX; \
+    case 4: return CODE_FOR_##PREFIX##v4##SUFFIX; \
+    case 8: return CODE_FOR_##PREFIX##v8##SUFFIX; \
+    case 16: return CODE_FOR_##PREFIX##v16##SUFFIX; \
+    case 32: return CODE_FOR_##PREFIX##v32##SUFFIX; \
+    case 64: return CODE_FOR_##PREFIX##v64##SUFFIX; \
+    } \
+  \
+  gcc_unreachable (); \
+  return CODE_FOR_nothing; \
+}
+
+#define CODE_FOR_OP(PREFIX) \
+	  CODE_FOR (PREFIX, qi) \
+	CODE_FOR (PREFIX, hi) \
+	CODE_FOR (PREFIX, hf) \
+	CODE_FOR (PREFIX, si) \
+	CODE_FOR (PREFIX, sf) \
+	CODE_FOR (PREFIX, di) \
+	CODE_FOR (PREFIX, df) \
+static int \
+get_code_for_##PREFIX (machine_mode mode) \
+{ \
+  int vf = GET_MODE_NUNITS (mode); \
+  machine_mode smode = GET_MODE_INNER (mode); \
+  \
+  switch (smode) \
+    { \
+    case E_QImode: return get_code_for_##PREFIX##vNqi (vf); \
+    case E_HImode: return get_code_for_##PREFIX##vNhi (vf); \
+    case E_HFmode: return get_code_for_##PREFIX##vNhf (vf); \
+    case E_SImode: return get_code_for_##PREFIX##vNsi (vf); \
+    case E_SFmode: return get_code_for_##PREFIX##vNsf (vf); \
+    case E_DImode: return get_code_for_##PREFIX##vNdi (vf); \
+    case E_DFmode: return get_code_for_##PREFIX##vNdf (vf); \
+    default: break; \
+    } \
+  \
+  gcc_unreachable (); \
+  return CODE_FOR_nothing; \
+}
+
+CODE_FOR_OP (reload_in)
+CODE_FOR_OP (reload_out)
+
+#undef CODE_FOR_OP
+#undef CODE_FOR
+
 /* }}}  */
 /* {{{ Addresses, pointers and moves.  */
 
@@ -1644,60 +1980,6 @@  regno_ok_for_index_p (int regno)
   return regno == M0_REG || VGPR_REGNO_P (regno);
 }
 
-/* Generate move which uses the exec flags.  If EXEC is NULL, then it is
-   assumed that all lanes normally relevant to the mode of the move are
-   affected.  If PREV is NULL, then a sensible default is supplied for
-   the inactive lanes.  */
-
-static rtx
-gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
-{
-  machine_mode mode = GET_MODE (op0);
-
-  if (vgpr_vector_mode_p (mode))
-    {
-      if (exec && exec != CONSTM1_RTX (DImode))
-	{
-	  if (!prev)
-	    prev = op0;
-	}
-      else
-	{
-	  if (!prev)
-	    prev = gcn_gen_undef (mode);
-	  exec = gcn_full_exec_reg ();
-	}
-
-      rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
-
-      return gen_rtx_PARALLEL (VOIDmode,
-	       gen_rtvec (2, set,
-			 gen_rtx_CLOBBER (VOIDmode,
-					  gen_rtx_SCRATCH (V64DImode))));
-    }
-
-  return (gen_rtx_PARALLEL
-	  (VOIDmode,
-	   gen_rtvec (2, gen_rtx_SET (op0, op1),
-		      gen_rtx_USE (VOIDmode,
-				   exec ? exec : gcn_scalar_exec ()))));
-}
-
-/* Generate masked move.  */
-
-static rtx
-gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
-{
-  if (exec)
-    return (gen_rtx_SET (op0,
-			 gen_rtx_VEC_MERGE (GET_MODE (op0),
-					    gen_rtx_VEC_DUPLICATE (GET_MODE
-								   (op0), op1),
-					    op2, exec)));
-  else
-    return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
-}
-
 /* Expand vector init of OP0 by VEC.
    Implements vec_init instruction pattern.  */
 
@@ -1707,10 +1989,11 @@  gcn_expand_vector_init (rtx op0, rtx vec)
   int64_t initialized_mask = 0;
   int64_t curr_mask = 1;
   machine_mode mode = GET_MODE (op0);
+  int vf = GET_MODE_NUNITS (mode);
 
   rtx val = XVECEXP (vec, 0, 0);
 
-  for (int i = 1; i < 64; i++)
+  for (int i = 1; i < vf; i++)
     if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
       curr_mask |= (int64_t) 1 << i;
 
@@ -1719,26 +2002,26 @@  gcn_expand_vector_init (rtx op0, rtx vec)
   else
     {
       val = force_reg (GET_MODE_INNER (mode), val);
-      emit_insn (gen_duplicate_load (op0, val));
+      emit_insn (gen_vec_duplicatevNm (op0, val));
     }
   initialized_mask |= curr_mask;
-  for (int i = 1; i < 64; i++)
+  for (int i = 1; i < vf; i++)
     if (!(initialized_mask & ((int64_t) 1 << i)))
       {
 	curr_mask = (int64_t) 1 << i;
 	rtx val = XVECEXP (vec, 0, i);
 
-	for (int j = i + 1; j < 64; j++)
+	for (int j = i + 1; j < vf; j++)
 	  if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
 	    curr_mask |= (int64_t) 1 << j;
 	if (gcn_constant_p (val))
-	  emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
-					get_exec (curr_mask)));
+	  emit_insn (gen_movvNm (op0, gcn_vec_constant (mode, val), op0,
+				 get_exec (curr_mask)));
 	else
 	  {
 	    val = force_reg (GET_MODE_INNER (mode), val);
-	    emit_insn (gen_duplicate_load (op0, val, op0,
-					   get_exec (curr_mask)));
+	    emit_insn (gen_vec_duplicatevNm (op0, val, op0,
+					     get_exec (curr_mask)));
 	  }
 	initialized_mask |= curr_mask;
       }
@@ -1751,18 +2034,18 @@  strided_constant (machine_mode mode, int base, int val)
 {
   rtx x = gen_reg_rtx (mode);
   emit_move_insn (x, gcn_vec_constant (mode, base));
-  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
-				 x, get_exec (0xffffffff00000000)));
-  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
-				 x, get_exec (0xffff0000ffff0000)));
-  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
-				 x, get_exec (0xff00ff00ff00ff00)));
-  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
-				 x, get_exec (0xf0f0f0f0f0f0f0f0)));
-  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
-				 x, get_exec (0xcccccccccccccccc)));
-  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
-				 x, get_exec (0xaaaaaaaaaaaaaaaa)));
+  emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 32),
+			  x, get_exec (0xffffffff00000000)));
+  emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 16),
+			  x, get_exec (0xffff0000ffff0000)));
+  emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 8),
+			  x, get_exec (0xff00ff00ff00ff00)));
+  emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 4),
+			  x, get_exec (0xf0f0f0f0f0f0f0f0)));
+  emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 2),
+			  x, get_exec (0xcccccccccccccccc)));
+  emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 1),
+			  x, get_exec (0xaaaaaaaaaaaaaaaa)));
   return x;
 }
 
@@ -1792,15 +2075,17 @@  gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
     case ADDR_SPACE_LDS:
     case ADDR_SPACE_GDS:
       /* FIXME: LDS support offsets, handle them!.  */
-      if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
+      if (vgpr_vector_mode_p (mode)
+	  && GET_MODE_INNER (GET_MODE (x)) != SImode)
 	{
-	  rtx addrs = gen_reg_rtx (V64SImode);
+	  machine_mode simode = VnMODE (GET_MODE_NUNITS (mode), SImode);
+	  rtx addrs = gen_reg_rtx (simode);
 	  rtx base = force_reg (SImode, x);
-	  rtx offsets = strided_constant (V64SImode, 0,
+	  rtx offsets = strided_constant (simode, 0,
 					  GET_MODE_UNIT_SIZE (mode));
 
-	  emit_insn (gen_vec_duplicatev64si (addrs, base));
-	  emit_insn (gen_addv64si3 (addrs, offsets, addrs));
+	  emit_insn (gen_vec_duplicatevNsi (addrs, base));
+	  emit_insn (gen_addvNsi3 (addrs, offsets, addrs));
 	  return addrs;
 	}
       return x;
@@ -1808,16 +2093,18 @@  gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
   gcc_unreachable ();
 }
 
-/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
+/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:VnDI)) with the
    proper vector of stepped addresses.
 
    MEM will be a DImode address of a vector in an SGPR.
-   TMP will be a V64DImode VGPR pair or (scratch:V64DI).  */
+   TMP will be a VnDImode VGPR pair or (scratch:VnDI).  */
 
 rtx
 gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
 				     rtx tmp)
 {
+  machine_mode pmode = VnMODE (GET_MODE_NUNITS (mode), DImode);
+  machine_mode offmode = VnMODE (GET_MODE_NUNITS (mode), SImode);
   gcc_assert (MEM_P (mem));
   rtx mem_base = XEXP (mem, 0);
   rtx mem_index = NULL_RTX;
@@ -1841,22 +2128,18 @@  gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
 
   machine_mode inner = GET_MODE_INNER (mode);
   int shift = exact_log2 (GET_MODE_SIZE (inner));
-  rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
-  rtx undef_v64si = gcn_gen_undef (V64SImode);
+  rtx ramp = gen_rtx_REG (offmode, VGPR_REGNO (1));
   rtx new_base = NULL_RTX;
   addr_space_t as = MEM_ADDR_SPACE (mem);
 
   rtx tmplo = (REG_P (tmp)
-	       ? gcn_operand_part (V64DImode, tmp, 0)
-	       : gen_reg_rtx (V64SImode));
+	       ? gcn_operand_part (pmode, tmp, 0)
+	       : gen_reg_rtx (offmode));
 
   /* tmplo[:] = ramp[:] << shift  */
-  if (exec)
-    emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
-				    gen_int_mode (shift, SImode),
-				    undef_v64si, exec));
-  else
-    emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
+  emit_insn (gen_ashlvNsi3 (tmplo, ramp,
+			    gen_int_mode (shift, SImode),
+			    NULL, exec));
 
   if (AS_FLAT_P (as))
     {
@@ -1866,53 +2149,41 @@  gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
 	{
 	  rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
 	  rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
-	  rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
+	  rtx tmphi = gcn_operand_part (pmode, tmp, 1);
 
 	  /* tmphi[:] = mem_base_hi  */
-	  if (exec)
-	    emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
-						    undef_v64si, exec));
-	  else
-	    emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
+	  emit_insn (gen_vec_duplicatevNsi (tmphi, mem_base_hi, NULL, exec));
 
 	  /* tmp[:] += zext (mem_base)  */
 	  if (exec)
 	    {
-	      emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
-						     vcc, undef_v64si, exec));
-	      emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
-					      vcc, vcc, undef_v64si, exec));
+	      emit_insn (gen_addvNsi3_vcc_dup (tmplo, mem_base_lo, tmplo,
+					       vcc, NULL, exec));
+	      emit_insn (gen_addcvNsi3 (tmphi, tmphi, const0_rtx,
+				        vcc, vcc, NULL, exec));
 	    }
 	  else
-	    emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
+	    emit_insn (gen_addvNdi3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
 	}
       else
 	{
-	  tmp = gen_reg_rtx (V64DImode);
-	  if (exec)
-	    emit_insn (gen_addv64di3_vcc_zext_dup2_exec
-		       (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
-			exec));
-	  else
-	    emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
+	  tmp = gen_reg_rtx (pmode);
+	  emit_insn (gen_addvNdi3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc,
+						 NULL, exec));
 	}
 
       new_base = tmp;
     }
   else if (AS_ANY_DS_P (as))
     {
-      if (!exec)
-	emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
-      else
-        emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
-					   gcn_gen_undef (V64SImode), exec));
+      emit_insn (gen_addvNsi3_dup (tmplo, tmplo, mem_base, NULL, exec));
       new_base = tmplo;
     }
   else
     {
-      mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
-      new_base = gen_rtx_PLUS (V64DImode, mem_base,
-			       gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
+      mem_base = gen_rtx_VEC_DUPLICATE (pmode, mem_base);
+      new_base = gen_rtx_PLUS (pmode, mem_base,
+			       gen_rtx_SIGN_EXTEND (pmode, tmplo));
     }
 
   return gen_rtx_PLUS (GET_MODE (new_base), new_base,
@@ -1929,42 +2200,33 @@  gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
    If EXEC is set then _exec patterns will be used, otherwise plain.
 
    Return values.
-     ADDR_SPACE_FLAT   - return V64DImode vector of absolute addresses.
-     ADDR_SPACE_GLOBAL - return V64SImode vector of offsets.  */
+     ADDR_SPACE_FLAT   - return VnDImode vector of absolute addresses.
+     ADDR_SPACE_GLOBAL - return VnSImode vector of offsets.  */
 
 rtx
 gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
 			   bool unsigned_p, rtx exec)
 {
-  rtx tmpsi = gen_reg_rtx (V64SImode);
-  rtx tmpdi = gen_reg_rtx (V64DImode);
-  rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
-  rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
+  int vf = GET_MODE_NUNITS (GET_MODE (offsets));
+  rtx tmpsi = gen_reg_rtx (VnMODE (vf, SImode));
+  rtx tmpdi = gen_reg_rtx (VnMODE (vf, DImode));
 
   if (CONST_INT_P (scale)
       && INTVAL (scale) > 0
       && exact_log2 (INTVAL (scale)) >= 0)
-    emit_insn (gen_ashlv64si3 (tmpsi, offsets,
-			       GEN_INT (exact_log2 (INTVAL (scale)))));
+    emit_insn (gen_ashlvNsi3 (tmpsi, offsets,
+			      GEN_INT (exact_log2 (INTVAL (scale))),
+			      NULL, exec));
   else
-    (exec
-     ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
-					  exec))
-     : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
+     emit_insn (gen_mulvNsi3_dup (tmpsi, offsets, scale, NULL, exec));
 
   /* "Global" instructions do not support negative register offsets.  */
   if (as == ADDR_SPACE_FLAT || !unsigned_p)
     {
       if (unsigned_p)
-	(exec
-	 ?  emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
-						    undefdi, exec))
-	 :  emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
+	 emit_insn (gen_addvNdi3_zext_dup2 (tmpdi, tmpsi, base, NULL, exec));
       else
-	(exec
-	 ?  emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
-						     undefdi, exec))
-	 :  emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
+	 emit_insn (gen_addvNdi3_sext_dup2 (tmpdi, tmpsi, base, NULL, exec));
       return tmpdi;
     }
   else if (as == ADDR_SPACE_GLOBAL)
@@ -2065,59 +2327,9 @@  gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
 	      || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
 	    {
 	      if (in_p)
-		switch (reload_mode)
-		  {
-		  case E_V64SImode:
-		    sri->icode = CODE_FOR_reload_inv64si;
-		    break;
-		  case E_V64SFmode:
-		    sri->icode = CODE_FOR_reload_inv64sf;
-		    break;
-		  case E_V64HImode:
-		    sri->icode = CODE_FOR_reload_inv64hi;
-		    break;
-		  case E_V64HFmode:
-		    sri->icode = CODE_FOR_reload_inv64hf;
-		    break;
-		  case E_V64QImode:
-		    sri->icode = CODE_FOR_reload_inv64qi;
-		    break;
-		  case E_V64DImode:
-		    sri->icode = CODE_FOR_reload_inv64di;
-		    break;
-		  case E_V64DFmode:
-		    sri->icode = CODE_FOR_reload_inv64df;
-		    break;
-		  default:
-		    gcc_unreachable ();
-		  }
+		sri->icode = get_code_for_reload_in (reload_mode);
 	      else
-		switch (reload_mode)
-		  {
-		  case E_V64SImode:
-		    sri->icode = CODE_FOR_reload_outv64si;
-		    break;
-		  case E_V64SFmode:
-		    sri->icode = CODE_FOR_reload_outv64sf;
-		    break;
-		  case E_V64HImode:
-		    sri->icode = CODE_FOR_reload_outv64hi;
-		    break;
-		  case E_V64HFmode:
-		    sri->icode = CODE_FOR_reload_outv64hf;
-		    break;
-		  case E_V64QImode:
-		    sri->icode = CODE_FOR_reload_outv64qi;
-		    break;
-		  case E_V64DImode:
-		    sri->icode = CODE_FOR_reload_outv64di;
-		    break;
-		  case E_V64DFmode:
-		    sri->icode = CODE_FOR_reload_outv64df;
-		    break;
-		  default:
-		    gcc_unreachable ();
-		  }
+		sri->icode = get_code_for_reload_out (reload_mode);
 	      break;
 	    }
 	  /* Fallthrough.  */
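
The get_code_for_reload_in/_out helpers that replace the hand-written switches
are presumably produced by the new CODE_FOR macros added elsewhere in gcn.cc;
conceptually they behave like the sketch below.  Only the V64 insn codes are
taken from the deleted code; the smaller-vector names are assumed:

    /* Conceptual sketch only; the real helper is macro-generated.  */
    static enum insn_code
    get_code_for_reload_in_sketch (machine_mode mode)
    {
      switch (mode)
        {
        case E_V64QImode: return CODE_FOR_reload_inv64qi;
        case E_V64HImode: return CODE_FOR_reload_inv64hi;
        case E_V64HFmode: return CODE_FOR_reload_inv64hf;
        case E_V64SImode: return CODE_FOR_reload_inv64si;
        case E_V64SFmode: return CODE_FOR_reload_inv64sf;
        case E_V64DImode: return CODE_FOR_reload_inv64di;
        case E_V64DFmode: return CODE_FOR_reload_inv64df;
        /* ...and likewise for the new V2..V32 variants.  */
        default: gcc_unreachable ();
        }
    }
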
@@ -3428,6 +3640,9 @@  gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
 
   if (VECTOR_MODE_P (from))
     {
+      if (GET_MODE_NUNITS (from) != GET_MODE_NUNITS (to))
+	return false;
+
       from = GET_MODE_INNER (from);
       to = GET_MODE_INNER (to);
     }
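
Worked examples of the new lane-count check (illustrative, not from the patch):

    /* gcn_valid_cvt_p (V32SFmode, V32SImode, op) - lane counts match, so the
       function falls through to the scalar SFmode/SImode checks below.
       gcn_valid_cvt_p (V64SFmode, V32SImode, op) - mixed 64/32 lanes, so it
       now returns false immediately.  */
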
@@ -3926,7 +4141,7 @@  gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
 	rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
 	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
 	/* FIXME: set attributes.  */
-	emit_insn (gen_mov_with_exec (target, mem, exec));
+	emit_insn (gen_movvNm (target, mem, NULL, exec));
 	return target;
       }
     case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
@@ -3961,20 +4176,18 @@  gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
 	rtx mem = gen_rtx_MEM (vmode, addrs);
 	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
 	/* FIXME: set attributes.  */
-	emit_insn (gen_mov_with_exec (mem, val, exec));
+	emit_insn (gen_movvNm (mem, val, NULL, exec));
 	return target;
       }
     case GCN_BUILTIN_SQRTVF:
       {
 	if (ignore)
 	  return target;
-	rtx exec = gcn_full_exec_reg ();
 	rtx arg = force_reg (V64SFmode,
 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
 					  V64SFmode,
 					  EXPAND_NORMAL));
-	emit_insn (gen_sqrtv64sf2_exec
-		   (target, arg, gcn_gen_undef (V64SFmode), exec));
+	emit_insn (gen_sqrtv64sf2 (target, arg));
 	return target;
       }
     case GCN_BUILTIN_SQRTF:
@@ -3992,20 +4205,17 @@  gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
       {
 	if (ignore)
 	  return target;
-	rtx exec = gcn_full_exec_reg ();
 	rtx arg = force_reg (V64SFmode,
 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
 					  V64SFmode,
 					  EXPAND_NORMAL));
-	emit_insn (gen_absv64sf2_exec
-		   (target, arg, gcn_gen_undef (V64SFmode), exec));
+	emit_insn (gen_absv64sf2 (target, arg));
 	return target;
       }
     case GCN_BUILTIN_LDEXPVF:
       {
 	if (ignore)
 	  return target;
-	rtx exec = gcn_full_exec_reg ();
 	rtx arg1 = force_reg (V64SFmode,
 			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
 					   V64SFmode,
@@ -4014,15 +4224,13 @@  gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
 			      expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
 					   V64SImode,
 					   EXPAND_NORMAL));
-	emit_insn (gen_ldexpv64sf3_exec
-		   (target, arg1, arg2, gcn_gen_undef (V64SFmode), exec));
+	emit_insn (gen_ldexpv64sf3 (target, arg1, arg2));
 	return target;
       }
     case GCN_BUILTIN_LDEXPV:
       {
 	if (ignore)
 	  return target;
-	rtx exec = gcn_full_exec_reg ();
 	rtx arg1 = force_reg (V64DFmode,
 			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
 					   V64SFmode,
@@ -4031,60 +4239,51 @@  gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
 			      expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
 					   V64SImode,
 					   EXPAND_NORMAL));
-	emit_insn (gen_ldexpv64df3_exec
-		   (target, arg1, arg2, gcn_gen_undef (V64DFmode), exec));
+	emit_insn (gen_ldexpv64df3 (target, arg1, arg2));
 	return target;
       }
     case GCN_BUILTIN_FREXPVF_EXP:
       {
 	if (ignore)
 	  return target;
-	rtx exec = gcn_full_exec_reg ();
 	rtx arg = force_reg (V64SFmode,
 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
 					  V64SFmode,
 					  EXPAND_NORMAL));
-	emit_insn (gen_frexpv64sf_exp2_exec
-		   (target, arg, gcn_gen_undef (V64SImode), exec));
+	emit_insn (gen_frexpv64sf_exp2 (target, arg));
 	return target;
       }
     case GCN_BUILTIN_FREXPVF_MANT:
       {
 	if (ignore)
 	  return target;
-	rtx exec = gcn_full_exec_reg ();
 	rtx arg = force_reg (V64SFmode,
 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
 					  V64SFmode,
 					  EXPAND_NORMAL));
-	emit_insn (gen_frexpv64sf_mant2_exec
-		   (target, arg, gcn_gen_undef (V64SFmode), exec));
+	emit_insn (gen_frexpv64sf_mant2 (target, arg));
 	return target;
       }
     case GCN_BUILTIN_FREXPV_EXP:
       {
 	if (ignore)
 	  return target;
-	rtx exec = gcn_full_exec_reg ();
 	rtx arg = force_reg (V64DFmode,
 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
 					  V64DFmode,
 					  EXPAND_NORMAL));
-	emit_insn (gen_frexpv64df_exp2_exec
-		   (target, arg, gcn_gen_undef (V64SImode), exec));
+	emit_insn (gen_frexpv64df_exp2 (target, arg));
 	return target;
       }
     case GCN_BUILTIN_FREXPV_MANT:
       {
 	if (ignore)
 	  return target;
-	rtx exec = gcn_full_exec_reg ();
 	rtx arg = force_reg (V64DFmode,
 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
 					  V64DFmode,
 					  EXPAND_NORMAL));
-	emit_insn (gen_frexpv64df_mant2_exec
-		   (target, arg, gcn_gen_undef (V64DFmode), exec));
+	emit_insn (gen_frexpv64df_mant2 (target, arg));
 	return target;
       }
     case GCN_BUILTIN_OMP_DIM_SIZE:
@@ -4239,10 +4438,11 @@  gcn_vectorize_get_mask_mode (machine_mode)
    Helper function for gcn_vectorize_vec_perm_const.  */
 
 static rtx
-gcn_make_vec_perm_address (unsigned int *perm)
+gcn_make_vec_perm_address (unsigned int *perm, int nelt)
 {
-  rtx x = gen_reg_rtx (V64SImode);
-  emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
+  machine_mode mode = VnMODE (nelt, SImode);
+  rtx x = gen_reg_rtx (mode);
+  emit_move_insn (x, gcn_vec_constant (mode, 0));
 
   /* Permutation addresses use byte addressing.  With each vector lane being
      4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
@@ -4258,15 +4458,13 @@  gcn_make_vec_perm_address (unsigned int *perm)
     {
       uint64_t exec_mask = 0;
       uint64_t lane_mask = 1;
-      for (int j = 0; j < 64; j++, lane_mask <<= 1)
-	if ((perm[j] * 4) & bit_mask)
+      for (int j = 0; j < nelt; j++, lane_mask <<= 1)
+	if (((perm[j] % nelt) * 4) & bit_mask)
 	  exec_mask |= lane_mask;
 
       if (exec_mask)
-	emit_insn (gen_addv64si3_exec (x, x,
-				       gcn_vec_constant (V64SImode,
-							 bit_mask),
-				       x, get_exec (exec_mask)));
+	emit_insn (gen_addvNsi3 (x, x, gcn_vec_constant (mode, bit_mask),
+				 x, get_exec (exec_mask)));
     }
 
   return x;
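
The loop above composes each lane's permutation address one significant bit at
a time, using exec-masked adds so that only the lanes needing a given bit
receive it.  The net per-lane value, as a sketch (not the patch's code):

    /* Each lane ends up holding the byte offset of the source lane it reads;
       SImode lanes are 4 bytes wide, hence the factor of 4.  */
    static void
    perm_byte_addresses (unsigned int *x, const unsigned int *perm, int nelt)
    {
      for (int j = 0; j < nelt; j++)
        x[j] = (perm[j] % nelt) * 4;
    }
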
@@ -4336,39 +4534,11 @@  gcn_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 	src1_lanes |= lane_bit;
     }
 
-  rtx addr = gcn_make_vec_perm_address (perm);
-  rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
-
-  switch (vmode)
-    {
-    case E_V64QImode:
-      ds_bpermute = gen_ds_bpermutev64qi;
-      break;
-    case E_V64HImode:
-      ds_bpermute = gen_ds_bpermutev64hi;
-      break;
-    case E_V64SImode:
-      ds_bpermute = gen_ds_bpermutev64si;
-      break;
-    case E_V64HFmode:
-      ds_bpermute = gen_ds_bpermutev64hf;
-      break;
-    case E_V64SFmode:
-      ds_bpermute = gen_ds_bpermutev64sf;
-      break;
-    case E_V64DImode:
-      ds_bpermute = gen_ds_bpermutev64di;
-      break;
-    case E_V64DFmode:
-      ds_bpermute = gen_ds_bpermutev64df;
-      break;
-    default:
-      gcc_assert (false);
-    }
+  rtx addr = gcn_make_vec_perm_address (perm, nelt);
 
   /* Load elements from src0 to dst.  */
-  gcc_assert (~src1_lanes);
-  emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
+  gcc_assert ((~src1_lanes) & (0xffffffffffffffffUL >> (64-nelt)));
+  emit_insn (gen_ds_bpermutevNm (dst, addr, src0, get_exec (vmode)));
 
   /* Load elements from src1 to dst.  */
   if (src1_lanes)
@@ -4379,8 +4549,8 @@  gcn_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
          the two source vectors together.
        */
       rtx tmp = gen_reg_rtx (vmode);
-      emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
-      emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
+      emit_insn (gen_ds_bpermutevNm (tmp, addr, src1, get_exec (vmode)));
+      emit_insn (gen_movvNm (dst, tmp, dst, get_exec (src1_lanes)));
     }
 
   return true;
@@ -4396,7 +4566,22 @@  gcn_vector_mode_supported_p (machine_mode mode)
 {
   return (mode == V64QImode || mode == V64HImode
 	  || mode == V64SImode || mode == V64DImode
-	  || mode == V64SFmode || mode == V64DFmode);
+	  || mode == V64SFmode || mode == V64DFmode
+	  || mode == V32QImode || mode == V32HImode
+	  || mode == V32SImode || mode == V32DImode
+	  || mode == V32SFmode || mode == V32DFmode
+	  || mode == V16QImode || mode == V16HImode
+	  || mode == V16SImode || mode == V16DImode
+	  || mode == V16SFmode || mode == V16DFmode
+	  || mode == V8QImode || mode == V8HImode
+	  || mode == V8SImode || mode == V8DImode
+	  || mode == V8SFmode || mode == V8DFmode
+	  || mode == V4QImode || mode == V4HImode
+	  || mode == V4SImode || mode == V4DImode
+	  || mode == V4SFmode || mode == V4DFmode
+	  || mode == V2QImode || mode == V2HImode
+	  || mode == V2SImode || mode == V2DImode
+	  || mode == V2SFmode || mode == V2DFmode);
 }
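
A compact equivalent of the mode list above, for reference only (a sketch, not
the patch's code): the target accepts exactly the 2/4/8/16/32/64-lane vectors
of QI, HI, SI, DI, SF and DF, with HF vectors still excluded as before:

    static bool
    vector_mode_supported_sketch (machine_mode mode)
    {
      if (!VECTOR_MODE_P (mode))
        return false;
      int n = GET_MODE_NUNITS (mode);
      machine_mode inner = GET_MODE_INNER (mode);
      return (n == 2 || n == 4 || n == 8 || n == 16 || n == 32 || n == 64)
             && (inner == QImode || inner == HImode || inner == SImode
                 || inner == DImode || inner == SFmode || inner == DFmode);
    }
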
 
 /* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
@@ -4425,23 +4610,74 @@  gcn_vectorize_preferred_simd_mode (scalar_mode mode)
     }
 }
 
+/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES.
+
+   Try all the vector modes.  */
+
+static unsigned int
+gcn_autovectorize_vector_modes (vector_modes *modes, bool ARG_UNUSED (all))
+{
+  modes->safe_push (V64QImode);
+  modes->safe_push (V64HImode);
+  modes->safe_push (V64SImode);
+  modes->safe_push (V64SFmode);
+  modes->safe_push (V64DImode);
+  modes->safe_push (V64DFmode);
+
+  modes->safe_push (V32QImode);
+  modes->safe_push (V32HImode);
+  modes->safe_push (V32SImode);
+  modes->safe_push (V32SFmode);
+  modes->safe_push (V32DImode);
+  modes->safe_push (V32DFmode);
+
+  modes->safe_push (V16QImode);
+  modes->safe_push (V16HImode);
+  modes->safe_push (V16SImode);
+  modes->safe_push (V16SFmode);
+  modes->safe_push (V16DImode);
+  modes->safe_push (V16DFmode);
+
+  modes->safe_push (V8QImode);
+  modes->safe_push (V8HImode);
+  modes->safe_push (V8SImode);
+  modes->safe_push (V8SFmode);
+  modes->safe_push (V8DImode);
+  modes->safe_push (V8DFmode);
+
+  modes->safe_push (V4QImode);
+  modes->safe_push (V4HImode);
+  modes->safe_push (V4SImode);
+  modes->safe_push (V4SFmode);
+  modes->safe_push (V4DImode);
+  modes->safe_push (V4DFmode);
+
+  modes->safe_push (V2QImode);
+  modes->safe_push (V2HImode);
+  modes->safe_push (V2SImode);
+  modes->safe_push (V2SFmode);
+  modes->safe_push (V2DImode);
+  modes->safe_push (V2DFmode);
+
+  /* We shouldn't need VECT_COMPARE_COSTS as they should all cost the same.  */
+  return 0;
+}
+
 /* Implement TARGET_VECTORIZE_RELATED_MODE.
 
    All GCN vectors are 64-lane, so this is simpler than other architectures.
    In particular, we do *not* want to match vector bit-size.  */
 
 static opt_machine_mode
-gcn_related_vector_mode (machine_mode ARG_UNUSED (vector_mode),
+gcn_related_vector_mode (machine_mode vector_mode,
 			 scalar_mode element_mode, poly_uint64 nunits)
 {
-  if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
-    return VOIDmode;
+  int n = nunits.to_constant ();
 
-  machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
-  if (!VECTOR_MODE_P (pref_mode))
-    return VOIDmode;
+  if (n == 0)
+    n = GET_MODE_NUNITS (vector_mode);
 
-  return pref_mode;
+  return VnMODE (n, element_mode);
 }
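
Worked examples of the new behaviour (illustrative, not from the patch): the
related mode keeps the requested lane count, or inherits it from the given
vector mode when nunits is zero:

    /* gcn_related_vector_mode (V64SFmode, SImode, 0) -> V64SImode
       gcn_related_vector_mode (V16DFmode, HImode, 0) -> V16HImode
       gcn_related_vector_mode (V64SImode, SFmode, 4) -> V4SFmode  */
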
 
 /* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
@@ -4566,6 +4802,8 @@  gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
    The vector register SRC of mode MODE is reduced using the operation given
    by UNSPEC, and the scalar result is returned in lane 63 of a vector
    register.  */
+/* FIXME: Implement reductions for sizes other than V64.
+          (They're currently disabled in the machine description.)  */
 
 rtx
 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
@@ -4975,10 +5213,11 @@  gcn_md_reorg (void)
 		      {
 			if (VECTOR_MODE_P (GET_MODE (x)))
 			  {
-			    new_exec = -1;
-			    break;
+			    int vf = GET_MODE_NUNITS (GET_MODE (x));
+			    new_exec = MAX ((uint64_t)new_exec,
+					    0xffffffffffffffffUL >> (64-vf));
 			  }
-			else
+			else if (new_exec == 0)
 			  new_exec = 1;
 		      }
 		  }
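
The MAX/shift expression above sizes the live-lane estimate to the vector
length instead of pessimistically assuming all 64 lanes.  A small sanity-check
sketch (not from the patch):

    #include <stdint.h>

    /* live_lanes (64) == 0xffffffffffffffff, live_lanes (32) == 0xffffffff,
       live_lanes (2) == 0x3: the low VF bits are set.  */
    static uint64_t
    live_lanes (int vf)
    {
      return 0xffffffffffffffffUL >> (64 - vf);
    }
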
@@ -5693,13 +5932,12 @@  static void
 print_reg (FILE *file, rtx x)
 {
   machine_mode mode = GET_MODE (x);
+  if (VECTOR_MODE_P (mode))
+    mode = GET_MODE_INNER (mode);
   if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
-      || mode == HFmode || mode == SFmode
-      || mode == V64SFmode || mode == V64SImode
-      || mode == V64QImode || mode == V64HImode)
+      || mode == HFmode || mode == SFmode)
     fprintf (file, "%s", reg_names[REGNO (x)]);
-  else if (mode == DImode || mode == V64DImode
-	   || mode == DFmode || mode == V64DFmode)
+  else if (mode == DImode || mode == DFmode)
     {
       if (SGPR_REGNO_P (REGNO (x)))
 	fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
@@ -6146,20 +6384,20 @@  print_operand (FILE *file, rtx x, int code)
     case 'o':
       {
 	const char *s = 0;
-	switch (GET_MODE_SIZE (GET_MODE (x)))
+	machine_mode mode = GET_MODE (x);
+	if (VECTOR_MODE_P (mode))
+	  mode = GET_MODE_INNER (mode);
+
+	switch (mode)
 	  {
-	  case 1:
+	  case E_QImode:
 	    s = "_ubyte";
 	    break;
-	  case 2:
+	  case E_HImode:
+	  case E_HFmode:
 	    s = "_ushort";
 	    break;
-	  /* The following are full-vector variants.  */
-	  case 64:
-	    s = "_ubyte";
-	    break;
-	  case 128:
-	    s = "_ushort";
+	  default:
 	    break;
 	  }
 
@@ -6174,43 +6412,31 @@  print_operand (FILE *file, rtx x, int code)
       }
     case 's':
       {
-	const char *s = "";
-	switch (GET_MODE_SIZE (GET_MODE (x)))
+	const char *s;
+	machine_mode mode = GET_MODE (x);
+	if (VECTOR_MODE_P (mode))
+	  mode = GET_MODE_INNER (mode);
+
+	switch (mode)
 	  {
-	  case 1:
+	  case E_QImode:
 	    s = "_byte";
 	    break;
-	  case 2:
+	  case E_HImode:
+	  case E_HFmode:
 	    s = "_short";
 	    break;
-	  case 4:
+	  case E_SImode:
+	  case E_SFmode:
 	    s = "_dword";
 	    break;
-	  case 8:
+	  case E_DImode:
+	  case E_DFmode:
 	    s = "_dwordx2";
 	    break;
-	  case 12:
-	    s = "_dwordx3";
-	    break;
-	  case 16:
+	  case E_TImode:
 	    s = "_dwordx4";
 	    break;
-	  case 32:
-	    s = "_dwordx8";
-	    break;
-	  case 64:
-	    s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
-	    break;
-	  /* The following are full-vector variants.  */
-	  case 128:
-	    s = "_short";
-	    break;
-	  case 256:
-	    s = "_dword";
-	    break;
-	  case 512:
-	    s = "_dwordx2";
-	    break;
 	  default:
 	    output_operand_lossage ("invalid operand %%xn code");
 	    return;
@@ -6714,6 +6940,9 @@  gcn_dwarf_register_span (rtx rtl)
 #define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
 #undef  TARGET_ATTRIBUTE_TABLE
 #define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
+#undef  TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
+#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
+  gcn_autovectorize_vector_modes
 #undef  TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL gcn_builtin_decl
 #undef  TARGET_CAN_CHANGE_MODE_CLASS