diff mbox series

Support vec_set/vec_extract/vec_init for V4HF/V2HF.

Message ID	20231110033900.246872-1-hongtao.liu@intel.com
State	Unresolved
Headers	Received-SPF: pass (google.com: domain of gcc-patches-bounces+ouuuleilei=gmail.com@gcc.gnu.org designates 2620:52:3:1:0:246e:9693:128c as permitted sender) client-ip=2620:52:3:1:0:246e:9693:128c; DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 285803858D33 From: liuhongt <hongtao.liu@intel.com> To: gcc-patches@gcc.gnu.org Cc: crazylht@gmail.com, hjl.tools@gmail.com Subject: [PATCH] Support vec_set/vec_extract/vec_init for V4HF/V2HF. Date: Fri, 10 Nov 2023 11:39:00 +0800 Message-Id: <20231110033900.246872-1-hongtao.liu@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: list Errors-To: gcc-patches-bounces+ouuuleilei=gmail.com@gcc.gnu.org X-getmail-retrieved-from-mailbox: INBOX
Series	Support vec_set/vec_extract/vec_init for V4HF/V2HF. \| Support vec_set/vec_extract/vec_init for V4HF/V2HF.

Checks

Context	Check	Description
snail/gcc-patch-check	warning	Git am fail log

Commit Message

liuhongt Nov. 10, 2023, 3:39 a.m. UTC

  Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

	* config/i386/i386-expand.cc
	(ix86_expand_vector_init_duplicate): Handle V4HF/V4BF and
	V2HF/V2BF.
	(ix86_expand_vector_init_one_nonzero): Ditto.
	(ix86_expand_vector_init_one_var): Ditto.
	(ix86_expand_vector_init_general): Ditto.
	(ix86_expand_vector_set_var): Ditto.
	(ix86_expand_vector_set): Ditto.
	(ix86_expand_vector_extract): Ditto.
	* config/i386/mmx.md
	(mmxdoublevecmode): Extend to V2HI/V2HF/V2BF.
	(mmxscalarmode): Extend to V4HF/V4BF/V2HF/V2BF/V4HI/V2HI.
	(mmxscalarmodelower): New mode attribute.
	(*mmx_pinsrw): Extend to V4FI_64, add a new alternative (&x,
	x, x), add a new define_split after the pattern.
	(*mmx_pextrw<mode>): New define_insn.
	(mmx_pshufw_1): Rename to ..
	(mmx_pshufw<mode>_1): .. this, extend to V4FI_64.
	(*mmx_pblendw64): Extend to V4FI_64.
	(*mmx_pblendw32): Extend to V2FI_32.
	(*vec_dup<mode>): New define_insn.
	(vec_setv4hi): Rename to ..
	(vec_set<mode>): .. this, and extend to V4FI_64.
	(vec_extractv4hihi): Rename to ..
	(vec_extract<mode><mmxscalarmodelower>): .. this, and extend
	to V4FI_64.
	(vec_init<mode><mmxscalarmodelower>): New define_expand.
	(*pinsrw): Extend to V2FI_32, add a new alternative (&x,
	x, x), and add a new define_split after it.
	(*pextrw<mode>): New define_insn.
	(vec_setv2hi): Rename to ..
	(vec_set<mode>): .. this, extend to V2FI_32.
	(vec_extractv2hihi): Rename to ..
	(vec_extract<mode><mmxscalarmodelower>): .. this, extend to
	V2FI_32.
	(*punpckwd): Extend to V2FI_32.
	(*pshufw_1): Rename to ..
	(*pshufw<mode>_1): .. this, extend to V2FI_32.
	(vec_initv2hihi): Rename to ..
	(vec_init<mode><mmxscalarmodelower>): .. this, and extend to
	V2FI_32.
	(*vec_dup<mode>): New define_insn.
	* config/i386/sse.md (*vec_extract<mode>): Refine constraint
	from v to Yw.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/part-vect-vec_elem-1.c: New test.
	* gcc.target/i386/part-vect-vec_elem-2.c: New test.
---
 gcc/config/i386/i386-expand.cc                |  60 ++++
 gcc/config/i386/mmx.md                        | 271 ++++++++++++++----
 gcc/config/i386/sse.md                        |   4 +-
 .../gcc.target/i386/part-vect-vec_elem-1.c    | 135 +++++++++
 .../gcc.target/i386/part-vect-vec_elem-2.c    | 135 +++++++++
 5 files changed, 541 insertions(+), 64 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vec_elem-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-vec_elem-2.c

diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8fad73c1549..b52ec51fbe4 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15592,6 +15592,17 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 	}
       goto widen;
 
+    case E_V4HFmode:
+    case E_V4BFmode:
+      if (TARGET_MMX_WITH_SSE)
+	{
+	  val = force_reg (GET_MODE_INNER (mode), val);
+	  rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
+	  emit_insn (gen_rtx_SET (target, x));
+	  return true;
+	}
+      return false;
+
     case E_V2HImode:
       if (TARGET_SSE2)
 	{
@@ -15605,6 +15616,17 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 	}
       return false;
 
+    case E_V2HFmode:
+    case E_V2BFmode:
+      if (TARGET_SSE2)
+	{
+	  val = force_reg (GET_MODE_INNER (mode), val);
+	  rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
+	  emit_insn (gen_rtx_SET (target, x));
+	  return true;
+	}
+      return false;
+
     case E_V8QImode:
     case E_V4QImode:
       if (!mmx_ok)
@@ -15815,6 +15837,8 @@  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
       use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
       break;
     case E_V4HImode:
+    case E_V4HFmode:
+    case E_V4BFmode:
       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
       break;
     case E_V4QImode:
@@ -16051,6 +16075,8 @@  ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
     case E_V4SImode:
     case E_V8HImode:
     case E_V4HImode:
+    case E_V4HFmode:
+    case E_V4BFmode:
       break;
 
     case E_V16QImode:
@@ -16438,6 +16464,7 @@  ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
   rtx ops[64], op0, op1, op2, op3, op4, op5;
   machine_mode half_mode = VOIDmode;
   machine_mode quarter_mode = VOIDmode;
+  machine_mode int_inner_mode = VOIDmode;
   int n, i;
 
   switch (mode)
@@ -16582,6 +16609,13 @@  quarter:
       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
       return;
 
+    case E_V4HFmode:
+    case E_V4BFmode:
+    case E_V2HFmode:
+    case E_V2BFmode:
+      int_inner_mode = HImode;
+      break;
+
     case E_V4HImode:
     case E_V8QImode:
 
@@ -16613,6 +16647,16 @@  quarter:
 	  for (j = 0; j < n_elt_per_word; ++j)
 	    {
 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
+	      if (int_inner_mode != E_VOIDmode)
+		{
+		  gcc_assert (TARGET_SSE2 && int_inner_mode == HImode);
+		  rtx tmp = gen_reg_rtx (int_inner_mode);
+		  elt = lowpart_subreg (int_inner_mode,
+					force_reg (inner_mode, elt),
+					inner_mode);
+		  emit_move_insn (tmp, elt);
+		  elt = tmp;
+		}
 	      elt = convert_modes (tmp_mode, inner_mode, elt, true);
 
 	      if (j == 0)
@@ -16839,6 +16883,14 @@  ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
 	case E_V16SFmode:
 	  cmp_mode = V16SImode;
 	  break;
+	case E_V2HFmode:
+	case E_V2BFmode:
+	  cmp_mode = V2HImode;
+	  break;
+	case E_V4HFmode:
+	case E_V4BFmode:
+	  cmp_mode = V4HImode;
+	  break;
 	case E_V8HFmode:
 	  cmp_mode = V8HImode;
 	  break;
@@ -17085,9 +17137,13 @@  ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
     case E_V8HFmode:
     case E_V8BFmode:
     case E_V2HImode:
+    case E_V2HFmode:
+    case E_V2BFmode:
       use_vec_merge = TARGET_SSE2;
       break;
     case E_V4HImode:
+    case E_V4HFmode:
+    case E_V4BFmode:
       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
       break;
 
@@ -17428,9 +17484,13 @@  ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
     case E_V8HFmode:
     case E_V8BFmode:
     case E_V2HImode:
+    case E_V2HFmode:
+    case E_V2BFmode:
       use_vec_extr = TARGET_SSE2;
       break;
     case E_V4HImode:
+    case E_V4HFmode:
+    case E_V4BFmode:
       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
       break;
 
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 50402b5b544..a3d08bb9d3b 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -112,11 +112,21 @@  (define_mode_attr mmxintvecmodelower
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr mmxdoublevecmode
-  [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF") (V4HI "V8HI")])
+  [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF") (V4HI "V8HI")
+   (V2HI "V4HI") (V2HF "V4HF") (V2BF "V4BF")])
 
 ;; Mapping of vector modes back to the scalar modes
 (define_mode_attr mmxscalarmode
-  [(V2SI "SI") (V2SF "SF")])
+  [(V2SI "SI") (V2SF "SF")
+   (V4HF "HF") (V4BF "BF")
+   (V2HF "HF") (V2BF "BF")
+   (V4HI "HI") (V2HI "HI")])
+
+(define_mode_attr mmxscalarmodelower
+  [(V2SI "si") (V2SF "sf")
+   (V4HF "hf") (V4BF "bf")
+   (V2HF "hf") (V2BF "bf")
+   (V4HI "hi") (V2HI "hi")])
 
 (define_mode_attr Yv_Yw
   [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")])
@@ -4882,11 +4892,11 @@  (define_insn "*mmx_pinsrd"
    (set_attr "mode" "TI")])
 
 (define_insn "*mmx_pinsrw"
-  [(set (match_operand:V4HI 0 "register_operand" "=y,x,YW")
-        (vec_merge:V4HI
-          (vec_duplicate:V4HI
-            (match_operand:HI 2 "nonimmediate_operand" "rm,rm,rm"))
-	  (match_operand:V4HI 1 "register_operand" "0,0,YW")
+  [(set (match_operand:V4FI_64 0 "register_operand" "=y,x,YW,&x")
+        (vec_merge:V4FI_64
+          (vec_duplicate:V4FI_64
+            (match_operand:<mmxscalarmode> 2 "nonimmediate_operand" "rm,rm,rm,x"))
+	  (match_operand:V4FI_64 1 "register_operand" "0,0,YW,x")
           (match_operand:SI 3 "const_int_operand")))]
   "(TARGET_MMX || TARGET_MMX_WITH_SSE)
    && (TARGET_SSE || TARGET_3DNOW_A)
@@ -4896,6 +4906,8 @@  (define_insn "*mmx_pinsrw"
   operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
   switch (which_alternative)
     {
+    case 3:
+      return "#";
     case 2:
       if (MEM_P (operands[2]))
 	return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}";
@@ -4911,11 +4923,28 @@  (define_insn "*mmx_pinsrw"
       gcc_unreachable ();
     }
 }
-  [(set_attr "isa" "*,sse2_noavx,avx")
-   (set_attr "mmx_isa" "native,*,*")
-   (set_attr "type" "mmxcvt,sselog,sselog")
+  [(set_attr "isa" "*,sse2_noavx,avx,sse4")
+   (set_attr "mmx_isa" "native,*,*,*")
+   (set_attr "type" "mmxcvt,sselog,sselog,sselog")
    (set_attr "length_immediate" "1")
-   (set_attr "mode" "DI,TI,TI")])
+   (set_attr "mode" "DI,TI,TI,TI")])
+
+;; For TARGET_SSE4_1, implement insert from XMM reg with PSHUFLW + PBLENDW.
+(define_split
+  [(set (match_operand:V4FI_64 0 "sse_reg_operand")
+	(vec_merge:V4FI_64
+	  (vec_duplicate:V4FI_64
+	    (match_operand:<mmxscalarmode> 2 "sse_reg_operand"))
+	  (match_operand:V4FI_64 1 "sse_reg_operand")
+	  (match_operand:SI 3 "const_int_operand")))]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1 && reload_completed
+   && ((unsigned) exact_log2 (INTVAL (operands[3]))
+       < GET_MODE_NUNITS (<MODE>mode))"
+  [(set (match_dup 0)
+	(vec_duplicate:V4FI_64 (match_dup 2)))
+   (set (match_dup 0)
+	(vec_merge:V4FI_64 (match_dup 1) (match_dup 0) (match_dup 3)))]
+  "operands[3] = GEN_INT (~INTVAL (operands[3]) & 0xf);")
 
 (define_insn "*mmx_pinsrb"
   [(set (match_operand:V8QI 0 "register_operand" "=x,YW")
@@ -4973,6 +5002,41 @@  (define_insn "*mmx_pextrw"
    (set_attr "prefix" "orig,maybe_vex,maybe_vex,maybe_evex")
    (set_attr "mode" "DI,TI,TI,TI")])
 
+(define_insn "*mmx_pextrw<mode>"
+  [(set (match_operand:<mmxscalarmode> 0 "register_sse4nonimm_operand" "=?r,?r,jm,m,x,Yw")
+	(vec_select:<mmxscalarmode>
+	  (match_operand:V4F_64 1 "register_operand" "y,YW,YW,YW,0,YW")
+	  (parallel [(match_operand:SI 2 "const_0_to_3_operand")])))]
+  "(TARGET_MMX || TARGET_MMX_WITH_SSE)
+   && (TARGET_SSE || TARGET_3DNOW_A)"
+{
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+     return "%vpextrw\t{%2, %1, %k0|%k0, %1, %2}";
+    case 2:
+    case 3:
+     return "%vpextrw\t{%2, %1, %0|%0, %1, %2}";
+    case 4:
+      operands[2] = GEN_INT (INTVAL (operands[2]) * 2);
+      return "psrldq\t{%2, %0|%0, %2}";
+    case 5:
+      operands[2] = GEN_INT (INTVAL (operands[2]) * 2);
+      return "vpsrldq\t{%2, %1, %0|%0, %1, %2}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,sse2,sse4_noavx,avx,noavx,avx")
+   (set_attr "addr" "*,*,gpr16,*,*,*")
+   (set_attr "mmx_isa" "native,*,*,*,*,*")
+   (set_attr "type" "mmxcvt,sselog1,sselog1,sselog1,sseishft1,sseishft1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,maybe_vex,maybe_vex,maybe_evex,orig,maybe_evex")
+   (set_attr "mode" "DI,TI,TI,TI,TI,TI")])
+
 (define_insn "*mmx_pextrw_zext"
   [(set (match_operand:SWI48 0 "register_operand" "=r,r")
 	(zero_extend:SWI48
@@ -5069,18 +5133,18 @@  (define_expand "mmx_pshufw"
    && (TARGET_SSE || TARGET_3DNOW_A)"
 {
   int mask = INTVAL (operands[2]);
-  emit_insn (gen_mmx_pshufw_1 (operands[0], operands[1],
-                               GEN_INT ((mask >> 0) & 3),
-                               GEN_INT ((mask >> 2) & 3),
-                               GEN_INT ((mask >> 4) & 3),
-                               GEN_INT ((mask >> 6) & 3)));
+  emit_insn (gen_mmx_pshufwv4hi_1 (operands[0], operands[1],
+				   GEN_INT ((mask >> 0) & 3),
+				   GEN_INT ((mask >> 2) & 3),
+				   GEN_INT ((mask >> 4) & 3),
+				   GEN_INT ((mask >> 6) & 3)));
   DONE;
 })
 
-(define_insn "mmx_pshufw_1"
-  [(set (match_operand:V4HI 0 "register_operand" "=y,Yw")
-        (vec_select:V4HI
-	  (match_operand:V4HI 1 "register_mmxmem_operand" "ym,Yw")
+(define_insn "mmx_pshufw<mode>_1"
+  [(set (match_operand:V4FI_64 0 "register_operand" "=y,Yw")
+        (vec_select:V4FI_64
+	  (match_operand:V4FI_64 1 "register_mmxmem_operand" "ym,Yw")
           (parallel [(match_operand 2 "const_0_to_3_operand")
                      (match_operand 3 "const_0_to_3_operand")
                      (match_operand 4 "const_0_to_3_operand")
@@ -5134,10 +5198,10 @@  (define_insn "*mmx_pshufd_1"
    (set_attr "mode" "TI")])
 
 (define_insn "*mmx_pblendw64"
-  [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,x")
-	(vec_merge:V4HI
-	  (match_operand:V4HI 2 "register_operand" "Yr,*x,x")
-	  (match_operand:V4HI 1 "register_operand" "0,0,x")
+  [(set (match_operand:V4FI_64 0 "register_operand" "=Yr,*x,x")
+	(vec_merge:V4FI_64
+	  (match_operand:V4FI_64 2 "register_operand" "Yr,*x,x")
+	  (match_operand:V4FI_64 1 "register_operand" "0,0,x")
 	  (match_operand:SI 3 "const_0_to_15_operand")))]
   "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
   "@
@@ -5152,10 +5216,10 @@  (define_insn "*mmx_pblendw64"
    (set_attr "mode" "TI")])
 
 (define_insn "*mmx_pblendw32"
-  [(set (match_operand:V2HI 0 "register_operand" "=Yr,*x,x")
-	(vec_merge:V2HI
-	  (match_operand:V2HI 2 "register_operand" "Yr,*x,x")
-	  (match_operand:V2HI 1 "register_operand" "0,0,x")
+  [(set (match_operand:V2FI_32 0 "register_operand" "=Yr,*x,x")
+	(vec_merge:V2FI_32
+	  (match_operand:V2FI_32 2 "register_operand" "Yr,*x,x")
+	  (match_operand:V2FI_32 1 "register_operand" "0,0,x")
 	  (match_operand:SI 3 "const_0_to_7_operand")))]
   "TARGET_SSE4_1"
   "@
@@ -5212,6 +5276,16 @@  (define_insn "*vec_dupv4hi"
    (set_attr "length_immediate" "1")
    (set_attr "mode" "DI,TI")])
 
+(define_insn "*vec_dup<mode>"
+  [(set (match_operand:V4F_64 0 "register_operand" "=Yw")
+	(vec_duplicate:V4F_64
+	  (match_operand:<mmxscalarmode> 1 "register_operand" "Yw")))]
+  "TARGET_MMX_WITH_SSE"
+  "%vpshuflw\t{$0, %1, %0|%0, %1, 0}"
+  [(set_attr "isa" "sse2")
+   (set_attr "type" "sselog1")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
 
 (define_insn "*vec_dupv2si"
   [(set (match_operand:V2SI 0 "register_operand" "=y,Yv")
@@ -5405,9 +5479,9 @@  (define_expand "vec_initv2sisi"
   DONE;
 })
 
-(define_expand "vec_setv4hi"
-  [(match_operand:V4HI 0 "register_operand")
-   (match_operand:HI 1 "register_operand")
+(define_expand "vec_set<mode>"
+  [(match_operand:V4FI_64 0 "register_operand")
+   (match_operand:<mmxscalarmode> 1 "register_operand")
    (match_operand 2 "vec_setm_mmx_operand")]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
 {
@@ -5419,9 +5493,9 @@  (define_expand "vec_setv4hi"
   DONE;
 })
 
-(define_expand "vec_extractv4hihi"
-  [(match_operand:HI 0 "register_operand")
-   (match_operand:V4HI 1 "register_operand")
+(define_expand "vec_extract<mode><mmxscalarmodelower>"
+  [(match_operand:<mmxscalarmode> 0 "register_operand")
+   (match_operand:V4FI_64 1 "register_operand")
    (match_operand 2 "const_int_operand")]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
 {
@@ -5440,6 +5514,16 @@  (define_expand "vec_initv4hihi"
   DONE;
 })
 
+(define_expand "vec_init<mode><mmxscalarmodelower>"
+  [(match_operand:V4F_64 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_MMX_WITH_SSE"
+{
+  ix86_expand_vector_init (TARGET_MMX_WITH_SSE, operands[0],
+			   operands[1]);
+  DONE;
+})
+
 (define_expand "vec_setv8qi"
   [(match_operand:V8QI 0 "register_operand")
    (match_operand:QI 1 "register_operand")
@@ -5476,11 +5560,11 @@  (define_expand "vec_initv8qiqi"
 })
 
 (define_insn "*pinsrw"
-  [(set (match_operand:V2HI 0 "register_operand" "=x,YW")
-        (vec_merge:V2HI
-          (vec_duplicate:V2HI
-            (match_operand:HI 2 "nonimmediate_operand" "rm,rm"))
-	  (match_operand:V2HI 1 "register_operand" "0,YW")
+  [(set (match_operand:V2FI_32 0 "register_operand" "=x,YW,&x")
+        (vec_merge:V2FI_32
+          (vec_duplicate:V2FI_32
+            (match_operand:<mmxscalarmode> 2 "nonimmediate_operand" "rm,rm,x"))
+	  (match_operand:V2FI_32 1 "register_operand" "0,YW,x")
           (match_operand:SI 3 "const_int_operand")))]
   "TARGET_SSE2
    && ((unsigned) exact_log2 (INTVAL (operands[3]))
@@ -5489,6 +5573,8 @@  (define_insn "*pinsrw"
   operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
   switch (which_alternative)
     {
+    case 2:
+      return "#";
     case 1:
       if (MEM_P (operands[2]))
 	return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}";
@@ -5503,11 +5589,29 @@  (define_insn "*pinsrw"
       gcc_unreachable ();
     }
 }
-  [(set_attr "isa" "noavx,avx")
+  [(set_attr "isa" "noavx,avx,sse4")
    (set_attr "type" "sselog")
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI")])
 
+;; For TARGET_SSE4_1, implement insert from XMM reg with PSHUFLW + PBLENDW.
+(define_split
+  [(set (match_operand:V2FI_32 0 "sse_reg_operand")
+	(vec_merge:V2FI_32
+	  (vec_duplicate:V2FI_32
+	    (match_operand:<mmxscalarmode> 2 "sse_reg_operand"))
+	  (match_operand:V2FI_32 1 "sse_reg_operand")
+	  (match_operand:SI 3 "const_int_operand")))]
+  "TARGET_SSE4_1 && reload_completed
+   && ((unsigned) exact_log2 (INTVAL (operands[3]))
+       < GET_MODE_NUNITS (<MODE>mode))"
+  [(set (match_dup 0)
+	(vec_duplicate:V2FI_32 (match_dup 2)))
+   (set (match_dup 0)
+	(vec_merge:V2FI_32 (match_dup 1) (match_dup 0) (match_dup 3)))]
+  "operands[3] = GEN_INT (~INTVAL (operands[3]) & 0x3);")
+
+
 (define_insn "*pinsrb"
   [(set (match_operand:V4QI 0 "register_operand" "=x,YW")
         (vec_merge:V4QI
@@ -5561,6 +5665,39 @@  (define_insn "*pextrw"
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+(define_insn "*pextrw<mode>"
+  [(set (match_operand:<mmxscalarmode> 0 "register_sse4nonimm_operand" "=?r,jm,m,x,Yw")
+	(vec_select:<mmxscalarmode>
+	  (match_operand:V2F_32 1 "register_operand" "YW,YW,YW,0,YW")
+	  (parallel [(match_operand:SI 2 "const_0_to_1_operand")])))]
+  "TARGET_SSE2"
+{
+  switch (which_alternative)
+    {
+    case 0:
+     return "%vpextrw\t{%2, %1, %k0|%k0, %1, %2}";
+    case 1:
+     return "pextrw\t{%2, %1, %0|%0, %1, %2}";
+    case 2:
+     return "vpextrw\t{%2, %1, %0|%0, %1, %2}";
+    case 3:
+      operands[2] = GEN_INT (INTVAL (operands[2]) * 2);
+      return "psrldq\t{%2, %0|%0, %2}";
+    case 4:
+      operands[2] = GEN_INT (INTVAL (operands[2]) * 2);
+      return "vpsrldq\t{%2, %1, %0|%0, %1, %2}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,sse4_noavx,avx,noavx,avx")
+   (set_attr "addr" "*,gpr16,*,*,*")
+   (set_attr "type" "sselog1,sselog1,sselog1,sseishft1,sseishft1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex,orig,maybe_evex,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
 (define_insn "*pextrw_zext"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
 	(zero_extend:SWI48
@@ -5608,9 +5745,9 @@  (define_insn "*pextrb_zext"
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
-(define_expand "vec_setv2hi"
-  [(match_operand:V2HI 0 "register_operand")
-   (match_operand:HI 1 "register_operand")
+(define_expand "vec_set<mode>"
+  [(match_operand:V2FI_32 0 "register_operand")
+   (match_operand:<mmxscalarmode> 1 "register_operand")
    (match_operand 2 "vec_setm_sse41_operand")]
   "TARGET_SSE2"
 {
@@ -5622,9 +5759,9 @@  (define_expand "vec_setv2hi"
   DONE;
 })
 
-(define_expand "vec_extractv2hihi"
-  [(match_operand:HI 0 "register_operand")
-   (match_operand:V2HI 1 "register_operand")
+(define_expand "vec_extract<mode><mmxscalarmodelower>"
+  [(match_operand:<mmxscalarmode> 0 "register_operand")
+   (match_operand:V2FI_32 1 "register_operand")
    (match_operand 2 "const_int_operand")]
   "TARGET_SSE2"
 {
@@ -5659,29 +5796,29 @@  (define_expand "vec_extractv4qiqi"
 })
 
 (define_insn_and_split "*punpckwd"
-  [(set (match_operand:V2HI 0 "register_operand" "=x,Yw")
-	(vec_select:V2HI
-	  (vec_concat:V4HI
-	    (match_operand:V2HI 1 "register_operand" "0,Yw")
-	    (match_operand:V2HI 2 "register_operand" "x,Yw"))
+  [(set (match_operand:V2FI_32 0 "register_operand" "=x,Yw")
+	(vec_select:V2FI_32
+	  (vec_concat:<mmxdoublevecmode>
+	    (match_operand:V2FI_32 1 "register_operand" "0,Yw")
+	    (match_operand:V2FI_32 2 "register_operand" "x,Yw"))
 	  (parallel [(match_operand 3 "const_0_to_3_operand")
 		     (match_operand 4 "const_0_to_3_operand")])))]
   "TARGET_SSE2"
   "#"
   "&& reload_completed"
   [(set (match_dup 5)
-	(vec_select:V8HI
+	(vec_select:<mmxxmmmode>
 	  (match_dup 5)
           (parallel [(match_dup 3) (match_dup 4)
                      (const_int 2) (const_int 3)
                      (const_int 4) (const_int 5)
                      (const_int 6) (const_int 7)])))]
 {
-  rtx dest = lowpart_subreg (V8HImode, operands[0], V2HImode);
-  rtx op1 = lowpart_subreg (V8HImode, operands[1], V2HImode);
-  rtx op2 = lowpart_subreg (V8HImode, operands[2], V2HImode);
+  rtx dest = lowpart_subreg (<mmxxmmmode>mode, operands[0], <MODE>mode);
+  rtx op1 = lowpart_subreg (<mmxxmmmode>mode, operands[1], <MODE>mode);
+  rtx op2 = lowpart_subreg (<mmxxmmmode>mode, operands[2], <MODE>mode);
 
-  emit_insn (gen_vec_interleave_lowv8hi (dest, op1, op2));
+  emit_insn (gen_vec_interleave_low<mmxxmmmodelower> (dest, op1, op2));
 
   static const int map[4] = { 0, 2, 1, 3 };
 
@@ -5699,10 +5836,10 @@  (define_insn_and_split "*punpckwd"
    (set_attr "type" "sselog")
    (set_attr "mode" "TI")])
 
-(define_insn "*pshufw_1"
-  [(set (match_operand:V2HI 0 "register_operand" "=Yw")
-        (vec_select:V2HI
-          (match_operand:V2HI 1 "register_operand" "Yw")
+(define_insn "*pshufw<mode>_1"
+  [(set (match_operand:V2FI_32 0 "register_operand" "=Yw")
+        (vec_select:V2FI_32
+          (match_operand:V2FI_32 1 "register_operand" "Yw")
           (parallel [(match_operand 2 "const_0_to_1_operand")
                      (match_operand 3 "const_0_to_1_operand")])))]
   "TARGET_SSE2"
@@ -5731,8 +5868,18 @@  (define_insn "*vec_dupv2hi"
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI")])
 
-(define_expand "vec_initv2hihi"
-  [(match_operand:V2HI 0 "register_operand")
+(define_insn "*vec_dup<mode>"
+  [(set (match_operand:V2F_32 0 "register_operand" "=Yw")
+	(vec_duplicate:V2F_32
+	  (match_operand:<mmxscalarmode> 1 "register_operand" "Yw")))]
+  "TARGET_SSE2"
+  "%vpshuflw\t{$0, %1, %0|%0, %1, 0}"
+  [(set_attr "type" "sselog1")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_expand "vec_init<mode><mmxscalarmodelower>"
+  [(match_operand:V2FI_32 0 "register_operand")
    (match_operand 1)]
   "TARGET_SSE2"
 {
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 33198756bb0..48a9bd99576 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12372,9 +12372,9 @@  (define_insn_and_split "*vec_extract<mode>_0"
   "operands[1] = gen_lowpart (<ssescalarmode>mode, operands[1]);")
 
 (define_insn "*vec_extract<mode>"
-  [(set (match_operand:HFBF 0 "register_sse4nonimm_operand" "=?r,jm,m,x,v")
+  [(set (match_operand:HFBF 0 "register_sse4nonimm_operand" "=?r,jm,m,x,Yw")
 	(vec_select:HFBF
-	  (match_operand:<ssevecmode> 1 "register_operand" "v,x,v,0,v")
+	  (match_operand:<ssevecmode> 1 "register_operand" "v,x,v,0,YW")
 	  (parallel
 	    [(match_operand:SI 2 "const_0_to_7_operand")])))]
   "TARGET_SSE2"
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-vec_elem-1.c b/gcc/testsuite/gcc.target/i386/part-vect-vec_elem-1.c
new file mode 100644
index 00000000000..dba98aa4810
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-vec_elem-1.c
@@ -0,0 +1,135 @@ 
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-options "-O1 -msse4.1" } */
+/* { dg-require-effective-target sse4 } */
+
+#include "sse4_1-check.h"
+
+typedef _Float16 v4hf __attribute__((vector_size(8)));
+
+v4hf
+__attribute__((noipa))
+vector_init_dupv4hf (_Float16 a)
+{
+  return __extension__(v4hf){a, a, a, a};
+}
+
+v4hf
+__attribute__((noipa))
+vector_init_allzero (_Float16 a)
+{
+  return __extension__(v4hf){0, 0, 0, 0};
+}
+
+v4hf
+__attribute__((noipa))
+vector_init_one_nonzero (_Float16 a)
+{
+  return __extension__(v4hf){0, 0, a, 0};
+}
+
+v4hf
+__attribute__((noipa))
+vector_init_one_var (_Float16 a)
+{
+  return __extension__(v4hf){1, 2, a, 4};
+}
+
+v4hf
+__attribute__((noipa))
+vector_init_general (_Float16 a, _Float16 a1, _Float16 a2, _Float16 a3)
+{
+  return __extension__(v4hf){a3, a2, a1, a};
+}
+
+v4hf
+__attribute__((noipa))
+vec_set  (_Float16 a, v4hf b)
+{
+  b[1] = a;
+  return b;
+}
+
+v4hf
+__attribute__((noipa))
+vec_set_var  (_Float16 a, v4hf b, int c)
+{
+  b[c] = a;
+  return b;
+}
+
+_Float16
+__attribute__((noipa))
+vec_extract  (v4hf b)
+{
+  return b[2];
+}
+
+static void
+sse4_1_test ()
+{
+  typedef union {
+    _Float16 a[4];
+    v4hf x;}union64hf;
+  union64hf res, exp, src;
+
+  res.x = vector_init_dupv4hf (1.0f16);
+  for (int i = 0; i != 4; i++)
+    exp.a[i] = 1.0f16;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 8) != 0)
+    __builtin_abort ();
+
+  res.x = vector_init_allzero (1.0f16);
+  for (int i = 0; i != 4; i++)
+    exp.a[i] = 0.0f16;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 8) != 0)
+    __builtin_abort ();
+
+  res.x = vector_init_one_nonzero (1.0f16);
+  for (int i = 0; i != 4; i++)
+    exp.a[i] = 0.0f16;
+  exp.a[2] = 1.0f16;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 8) != 0)
+    __builtin_abort ();
+
+  res.x = vector_init_one_var (3.0f16);
+  for (int i = 0; i != 4; i++)
+    exp.a[i] = i + 1;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 8) != 0)
+    __builtin_abort ();
+
+  res.x = vector_init_general (4.0, 3.0f, 2.0f, 1.0);
+  for (int i = 0; i != 4; i++)
+    exp.a[i] = 1 + i;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 8) != 0)
+    __builtin_abort ();
+
+  for (int i = 0; i != 4; i++)
+    {
+      src.a[i] = i;
+      exp.a[i] = i;
+    }
+  res.x = vec_set (3.0f, src.x);
+  exp.a[1] = 3.0f;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 8) != 0)
+    __builtin_abort ();
+
+  for (int i = 0; i != 4; i++)
+    {
+      src.a[i] = i;
+      exp.a[i] = i;
+    }
+  res.x = vec_set_var (3.0f, src.x, 1);
+  exp.a[1] = 3.0f;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 8) != 0)
+    __builtin_abort ();
+
+  for (int i = 0; i != 4; i++)
+    {
+      src.a[i] = i;
+      exp.a[i] = i;
+    }
+  _Float16 res_scalar = vec_extract (src.x);
+  if (res_scalar != 2.0f)
+    __builtin_abort ();
+  return ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-vec_elem-2.c b/gcc/testsuite/gcc.target/i386/part-vect-vec_elem-2.c
new file mode 100644
index 00000000000..cc195638bff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-vec_elem-2.c
@@ -0,0 +1,135 @@ 
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-options "-O1 -msse4.1" } */
+/* { dg-require-effective-target sse4 } */
+
+#include "sse4_1-check.h"
+
+typedef _Float16 v2hf __attribute__((vector_size(4)));
+
+v2hf
+__attribute__((noipa))
+vector_init_dupv2hf (_Float16 a)
+{
+  return __extension__(v2hf){a, a};
+}
+
+v2hf
+__attribute__((noipa))
+vector_init_allzero (_Float16 a)
+{
+  return __extension__(v2hf){0, 0};
+}
+
+v2hf
+__attribute__((noipa))
+vector_init_one_nonzero (_Float16 a)
+{
+  return __extension__(v2hf){0, a};
+}
+
+v2hf
+__attribute__((noipa))
+vector_init_one_var (_Float16 a)
+{
+  return __extension__(v2hf){1, a};
+}
+
+v2hf
+__attribute__((noipa))
+vector_init_general (_Float16 a1, _Float16 a2)
+{
+  return __extension__(v2hf){a2, a1};
+}
+
+v2hf
+__attribute__((noipa))
+vec_set  (_Float16 a, v2hf b)
+{
+  b[1] = a;
+  return b;
+}
+
+v2hf
+__attribute__((noipa))
+vec_set_var  (_Float16 a, v2hf b, int c)
+{
+  b[c] = a;
+  return b;
+}
+
+_Float16
+__attribute__((noipa))
+vec_extract  (v2hf b)
+{
+  return b[1];
+}
+
+static void
+sse4_1_test ()
+{
+  typedef union {
+    _Float16 a[2];
+    v2hf x;}union64hf;
+  union64hf res, exp, src;
+
+  res.x = vector_init_dupv2hf (1.0f16);
+  for (int i = 0; i != 2; i++)
+    exp.a[i] = 1.0f16;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 4) != 0)
+    __builtin_abort ();
+
+  res.x = vector_init_allzero (1.0f16);
+  for (int i = 0; i != 2; i++)
+    exp.a[i] = 0.0f16;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 4) != 0)
+    __builtin_abort ();
+
+  res.x = vector_init_one_nonzero (1.0f16);
+  for (int i = 0; i != 2; i++)
+    exp.a[i] = 0.0f16;
+  exp.a[1] = 1.0f16;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 4) != 0)
+    __builtin_abort ();
+
+  res.x = vector_init_one_var (3.0f16);
+  exp.a[0] = 1;
+  exp.a[1] = 3;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 4) != 0)
+    __builtin_abort ();
+
+  res.x = vector_init_general (2.0f, 1.0);
+  for (int i = 0; i != 2; i++)
+    exp.a[i] = 1 + i;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 4) != 0)
+    __builtin_abort ();
+
+  for (int i = 0; i != 2; i++)
+    {
+      src.a[i] = i;
+      exp.a[i] = i;
+    }
+  res.x = vec_set (3.0f, src.x);
+  exp.a[1] = 3.0f;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 4) != 0)
+    __builtin_abort ();
+
+  for (int i = 0; i != 2; i++)
+    {
+      src.a[i] = i;
+      exp.a[i] = i;
+    }
+  res.x = vec_set_var (3.0f, src.x, 1);
+  exp.a[1] = 3.0f;
+  if (__builtin_memcmp (&res.a[0], &exp.a[0], 4) != 0)
+    __builtin_abort ();
+
+  for (int i = 0; i != 2; i++)
+    {
+      src.a[i] = i;
+      exp.a[i] = i;
+    }
+  _Float16 res_scalar = vec_extract (src.x);
+  if (res_scalar != 1.0f)
+    __builtin_abort ();
+  return ;
+}