vect: Don't allow vect_emulated_vector_p type in vectorizable_call [PR106322]

Message ID ce00c149-345a-1fac-04e7-00ff3b731f9f@linux.ibm.com

Commit Message

Kewen.Lin Aug. 12, 2022, 9:40 a.m. UTC
  Hi,

As PR106322 shows, for a vector type whose TYPE_MODE is a scalar
integral mode instead of a vector mode, it's possible to obtain
wrong target support information when querying with that scalar
integral mode.  For example, for the test case in PR106322, on
32-bit ppc64 the vectorizer gets vector type "vector(2) short
unsigned int" for scalar type "short unsigned int", and its mode
is SImode instead of V2HImode.  The target support query checks
the umul_highpart optab with SImode and considers it supported,
so the vectorizer goes on to generate a .MULH IFN call for that
vector type.  Unfortunately it's wrong to use the SImode support
for the multiply highpart of that vector type here.
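
For reference, a reduced loop of the kind that hits this (essentially
what the simplified C testcase later in this thread exercises) is a
highpart multiply on unsigned short elements.  A sketch, not part of
the patch itself:

  #define N 64
  unsigned short a[N], b[N], c[N];

  void
  foo (void)
  {
    for (int i = 0; i < N; i++)
      /* Highpart of a 16-bit by 16-bit product.  On a target like
         32-bit ppc64 with no suitable vector mode, the vectorizer
         picks "vector(2) short unsigned int" whose TYPE_MODE is
         SImode and wrongly maps this to a .MULH call.  */
      c[i] = ((unsigned int) a[i] * (unsigned int) b[i]) >> 16;
  }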

This patch is to teach vectorizable_call analysis not to allow
vect_emulated_vector_p type for both vectype_in and vectype_out
as Richi suggested.
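
For context, vect_emulated_vector_p essentially tests whether the
vector type's TYPE_MODE is not a real vector mode, i.e. whether the
vector would have to be open-coded on scalar registers.  A simplified
sketch of the check (the special casing of boolean vector types is
elided here):

  /* Return true if VECTYPE must be emulated with scalar code,
     i.e. its TYPE_MODE is an integer mode rather than a vector
     mode.  */
  bool
  vect_emulated_vector_p (tree vectype)
  {
    return !VECTOR_MODE_P (TYPE_MODE (vectype));
  }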

Bootstrapped and regtested on x86_64-redhat-linux,
aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

Is it ok for trunk?  If it's ok, I guess we want this to be
backported?

BR,
Kewen
-----
	PR tree-optimization/106322

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_call): Don't allow
	vect_emulated_vector_p type for both vectype_in and vectype_out.

gcc/testsuite/ChangeLog:

	* g++.target/i386/pr106322.C: New test.
	* g++.target/powerpc/pr106322.C: New test.
---
 gcc/testsuite/g++.target/i386/pr106322.C    | 196 ++++++++++++++++++++
 gcc/testsuite/g++.target/powerpc/pr106322.C | 195 +++++++++++++++++++
 gcc/tree-vect-stmts.cc                      |   8 +
 3 files changed, 399 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr106322.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/pr106322.C

--
2.27.0
  

Comments

Richard Biener Aug. 12, 2022, 11:14 a.m. UTC | #1
On Fri, Aug 12, 2022 at 11:41 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi,
>
> As PR106322 shows, in some cases for some vector type whose
> TYPE_MODE is a scalar integral mode instead of a vector mode,
> it's possible to obtain wrong target support information when
> querying with the scalar integral mode.  For example, for the
> test case in PR106322, on ppc64 32bit vectorizer gets vector
> type "vector(2) short unsigned int" for scalar type "short
> unsigned int", its mode is SImode instead of V2HImode.  The
> target support querying checks umul_highpart optab with SImode
> and considers it's supported, then vectorizer further generates
> .MULH IFN call for that vector type.  Unfortunately it's wrong
> to use SImode support for that vector type multiply highpart
> here.
>
> This patch is to teach vectorizable_call analysis not to allow
> vect_emulated_vector_p type for both vectype_in and vectype_out
> as Richi suggested.
>
> Bootstrapped and regtested on x86_64-redhat-linux,
> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>
> Is it ok for trunk?

OK for trunk.

> If it's ok, I guess we want this to be
> backported?

Yes, but you just missed the RC for 12.2, so please wait until after
GCC 12.2 is released and the branch is open again.  The testcase looks
mighty complicated, so fallout there might well be possible as well ;)
I suppose it wasn't possible to craft a simple C testcase after the
analysis?

Richard.

>
> BR,
> Kewen
> -----
>         PR tree-optimization/106322
>
> gcc/ChangeLog:
>
>         * tree-vect-stmts.cc (vectorizable_call): Don't allow
>         vect_emulated_vector_p type for both vectype_in and vectype_out.
>
> gcc/testsuite/ChangeLog:
>
>         * g++.target/i386/pr106322.C: New test.
>         * g++.target/powerpc/pr106322.C: New test.
> ---
>  gcc/testsuite/g++.target/i386/pr106322.C    | 196 ++++++++++++++++++++
>  gcc/testsuite/g++.target/powerpc/pr106322.C | 195 +++++++++++++++++++
>  gcc/tree-vect-stmts.cc                      |   8 +
>  3 files changed, 399 insertions(+)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr106322.C
>  create mode 100644 gcc/testsuite/g++.target/powerpc/pr106322.C
>
> diff --git a/gcc/testsuite/g++.target/i386/pr106322.C b/gcc/testsuite/g++.target/i386/pr106322.C
> new file mode 100644
> index 00000000000..3cd8d6bf225
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr106322.C
> @@ -0,0 +1,196 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ia32 } */
> +/* { dg-require-effective-target c++11 } */
> +/* { dg-options "-O2 -mtune=generic -march=i686" } */
> +
> +/* As PR106322, verify this can execute well (not abort).  */
> +
> +#include <atomic>
> +#include <cassert>
> +#include <cstdlib>
> +#include <cstring>
> +#include <limits>
> +#include <memory>
> +
> +__attribute__((noipa))
> +bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) {
> +  return memcmp(bytes1, bytes2, size) == 0;
> +}
> +
> +#define HWY_ALIGNMENT 64
> +constexpr size_t kAlignment = HWY_ALIGNMENT;
> +constexpr size_t kAlias = kAlignment * 4;
> +
> +namespace hwy {
> +namespace N_EMU128 {
> +template <typename T, size_t N = 16 / sizeof(T)> struct Vec128 {
> +  T raw[16 / sizeof(T)] = {};
> +};
> +} // namespace N_EMU128
> +} // namespace hwy
> +
> +template <typename T, size_t N>
> +static void Store(const hwy::N_EMU128::Vec128<T, N> v,
> +                  T *__restrict__ aligned) {
> +  __builtin_memcpy(aligned, v.raw, sizeof(T) * N);
> +}
> +
> +template <typename T, size_t N>
> +static hwy::N_EMU128::Vec128<T, N> Load(const T *__restrict__ aligned) {
> +  hwy::N_EMU128::Vec128<T, N> v;
> +  __builtin_memcpy(v.raw, aligned, sizeof(T) * N);
> +  return v;
> +}
> +
> +template <size_t N>
> +static hwy::N_EMU128::Vec128<uint16_t, N>
> +MulHigh(hwy::N_EMU128::Vec128<uint16_t, N> a,
> +        const hwy::N_EMU128::Vec128<uint16_t, N> b) {
> +  for (size_t i = 0; i < N; ++i) {
> +    // Cast to uint32_t first to prevent overflow. Otherwise the result of
> +    // uint16_t * uint16_t is in "int" which may overflow. In practice the
> +    // result is the same but this way it is also defined.
> +    a.raw[i] = static_cast<uint16_t>(
> +        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
> +        16);
> +  }
> +  return a;
> +}
> +
> +#define HWY_ASSERT(condition) assert((condition))
> +#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
> +
> +#pragma pack(push, 1)
> +struct AllocationHeader {
> +  void *allocated;
> +  size_t payload_size;
> +};
> +#pragma pack(pop)
> +
> +static void FreeAlignedBytes(const void *aligned_pointer) {
> +  HWY_ASSERT(aligned_pointer != nullptr);
> +  if (aligned_pointer == nullptr)
> +    return;
> +
> +  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
> +  HWY_ASSERT(payload % kAlignment == 0);
> +  const AllocationHeader *header =
> +      reinterpret_cast<const AllocationHeader *>(payload) - 1;
> +
> +  free(header->allocated);
> +}
> +
> +class AlignedFreer {
> +public:
> +  template <typename T> void operator()(T *aligned_pointer) const {
> +    FreeAlignedBytes(aligned_pointer);
> +  }
> +};
> +
> +template <typename T>
> +using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
> +
> +static inline constexpr size_t ShiftCount(size_t n) {
> +  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
> +}
> +
> +namespace {
> +static size_t NextAlignedOffset() {
> +  static std::atomic<uint32_t> next{0};
> +  constexpr uint32_t kGroups = kAlias / kAlignment;
> +  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
> +  const size_t offset = kAlignment * group;
> +  HWY_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
> +  return offset;
> +}
> +} // namespace
> +
> +static void *AllocateAlignedBytes(const size_t payload_size) {
> +  HWY_ASSERT(payload_size != 0); // likely a bug in caller
> +  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
> +    HWY_ASSERT(false && "payload_size too large");
> +    return nullptr;
> +  }
> +
> +  size_t offset = NextAlignedOffset();
> +
> +  // What: | misalign | unused | AllocationHeader |payload
> +  // Size: |<= kAlias | offset                    |payload_size
> +  //       ^allocated.^aligned.^header............^payload
> +  // The header must immediately precede payload, which must remain aligned.
> +  // To avoid wasting space, the header resides at the end of `unused`,
> +  // which therefore cannot be empty (offset == 0).
> +  if (offset == 0) {
> +    offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
> +    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
> +  }
> +
> +  const size_t allocated_size = kAlias + offset + payload_size;
> +  void *allocated = malloc(allocated_size);
> +  HWY_ASSERT(allocated != nullptr);
> +  if (allocated == nullptr)
> +    return nullptr;
> +  // Always round up even if already aligned - we already asked for kAlias
> +  // extra bytes and there's no way to give them back.
> +  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
> +  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
> +  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
> +  aligned &= ~(kAlias - 1);
> +
> +  const uintptr_t payload = aligned + offset; // still aligned
> +
> +  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
> +  // The allocated_size can be reconstructed from the payload_size.
> +  AllocationHeader *header = reinterpret_cast<AllocationHeader *>(payload) - 1;
> +  header->allocated = allocated;
> +  header->payload_size = payload_size;
> +
> +  return HWY_ASSUME_ALIGNED(reinterpret_cast<void *>(payload), kAlignment);
> +}
> +
> +template <typename T> static T *AllocateAlignedItems(size_t items) {
> +  constexpr size_t size = sizeof(T);
> +
> +  constexpr bool is_pow2 = (size & (size - 1)) == 0;
> +  constexpr size_t bits = ShiftCount(size);
> +  static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
> +
> +  const size_t bytes = is_pow2 ? items << bits : items * size;
> +  const size_t check = is_pow2 ? bytes >> bits : bytes / size;
> +  if (check != items) {
> +    return nullptr; // overflowed
> +  }
> +  return static_cast<T *>(AllocateAlignedBytes(bytes));
> +}
> +
> +template <typename T>
> +static AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
> +  return AlignedFreeUniquePtr<T[]>(AllocateAlignedItems<T>(items),
> +                                   AlignedFreer());
> +}
> +
> +int main() {
> +  AlignedFreeUniquePtr<uint16_t[]> in_lanes = AllocateAligned<uint16_t>(2);
> +  uint16_t expected_lanes[2];
> +  in_lanes[0] = 65535;
> +  in_lanes[1] = 32767;
> +  expected_lanes[0] = 65534;
> +  expected_lanes[1] = 16383;
> +  hwy::N_EMU128::Vec128<uint16_t, 2> v = Load<uint16_t, 2>(in_lanes.get());
> +  hwy::N_EMU128::Vec128<uint16_t, 2> actual = MulHigh(v, v);
> +  {
> +    auto actual_lanes = AllocateAligned<uint16_t>(2);
> +    Store(actual, actual_lanes.get());
> +    const uint8_t *expected_array =
> +        reinterpret_cast<const uint8_t *>(expected_lanes);
> +    const uint8_t *actual_array =
> +        reinterpret_cast<const uint8_t *>(actual_lanes.get());
> +    for (size_t i = 0; i < 2; ++i) {
> +      const uint8_t *expected_ptr = expected_array + i * 2;
> +      const uint8_t *actual_ptr = actual_array + i * 2;
> +      if (!BytesEqual(expected_ptr, actual_ptr, 2)) {
> +        abort();
> +      }
> +    }
> +  }
> +}
> diff --git a/gcc/testsuite/g++.target/powerpc/pr106322.C b/gcc/testsuite/g++.target/powerpc/pr106322.C
> new file mode 100644
> index 00000000000..1de6e5e37e5
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/powerpc/pr106322.C
> @@ -0,0 +1,195 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target c++11 } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power4" } */
> +
> +/* As PR106322, verify this can execute well (not abort).  */
> +
> +#include <atomic>
> +#include <cassert>
> +#include <cstdlib>
> +#include <cstring>
> +#include <limits>
> +#include <memory>
> +
> +__attribute__((noipa))
> +bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) {
> +  return memcmp(bytes1, bytes2, size) == 0;
> +}
> +
> +#define HWY_ALIGNMENT 64
> +constexpr size_t kAlignment = HWY_ALIGNMENT;
> +constexpr size_t kAlias = kAlignment * 4;
> +
> +namespace hwy {
> +namespace N_EMU128 {
> +template <typename T, size_t N = 16 / sizeof(T)> struct Vec128 {
> +  T raw[16 / sizeof(T)] = {};
> +};
> +} // namespace N_EMU128
> +} // namespace hwy
> +
> +template <typename T, size_t N>
> +static void Store(const hwy::N_EMU128::Vec128<T, N> v,
> +                  T *__restrict__ aligned) {
> +  __builtin_memcpy(aligned, v.raw, sizeof(T) * N);
> +}
> +
> +template <typename T, size_t N>
> +static hwy::N_EMU128::Vec128<T, N> Load(const T *__restrict__ aligned) {
> +  hwy::N_EMU128::Vec128<T, N> v;
> +  __builtin_memcpy(v.raw, aligned, sizeof(T) * N);
> +  return v;
> +}
> +
> +template <size_t N>
> +static hwy::N_EMU128::Vec128<uint16_t, N>
> +MulHigh(hwy::N_EMU128::Vec128<uint16_t, N> a,
> +        const hwy::N_EMU128::Vec128<uint16_t, N> b) {
> +  for (size_t i = 0; i < N; ++i) {
> +    // Cast to uint32_t first to prevent overflow. Otherwise the result of
> +    // uint16_t * uint16_t is in "int" which may overflow. In practice the
> +    // result is the same but this way it is also defined.
> +    a.raw[i] = static_cast<uint16_t>(
> +        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
> +        16);
> +  }
> +  return a;
> +}
> +
> +#define HWY_ASSERT(condition) assert((condition))
> +#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
> +
> +#pragma pack(push, 1)
> +struct AllocationHeader {
> +  void *allocated;
> +  size_t payload_size;
> +};
> +#pragma pack(pop)
> +
> +static void FreeAlignedBytes(const void *aligned_pointer) {
> +  HWY_ASSERT(aligned_pointer != nullptr);
> +  if (aligned_pointer == nullptr)
> +    return;
> +
> +  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
> +  HWY_ASSERT(payload % kAlignment == 0);
> +  const AllocationHeader *header =
> +      reinterpret_cast<const AllocationHeader *>(payload) - 1;
> +
> +  free(header->allocated);
> +}
> +
> +class AlignedFreer {
> +public:
> +  template <typename T> void operator()(T *aligned_pointer) const {
> +    FreeAlignedBytes(aligned_pointer);
> +  }
> +};
> +
> +template <typename T>
> +using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
> +
> +static inline constexpr size_t ShiftCount(size_t n) {
> +  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
> +}
> +
> +namespace {
> +static size_t NextAlignedOffset() {
> +  static std::atomic<uint32_t> next{0};
> +  constexpr uint32_t kGroups = kAlias / kAlignment;
> +  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
> +  const size_t offset = kAlignment * group;
> +  HWY_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
> +  return offset;
> +}
> +} // namespace
> +
> +static void *AllocateAlignedBytes(const size_t payload_size) {
> +  HWY_ASSERT(payload_size != 0); // likely a bug in caller
> +  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
> +    HWY_ASSERT(false && "payload_size too large");
> +    return nullptr;
> +  }
> +
> +  size_t offset = NextAlignedOffset();
> +
> +  // What: | misalign | unused | AllocationHeader |payload
> +  // Size: |<= kAlias | offset                    |payload_size
> +  //       ^allocated.^aligned.^header............^payload
> +  // The header must immediately precede payload, which must remain aligned.
> +  // To avoid wasting space, the header resides at the end of `unused`,
> +  // which therefore cannot be empty (offset == 0).
> +  if (offset == 0) {
> +    offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
> +    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
> +  }
> +
> +  const size_t allocated_size = kAlias + offset + payload_size;
> +  void *allocated = malloc(allocated_size);
> +  HWY_ASSERT(allocated != nullptr);
> +  if (allocated == nullptr)
> +    return nullptr;
> +  // Always round up even if already aligned - we already asked for kAlias
> +  // extra bytes and there's no way to give them back.
> +  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
> +  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
> +  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
> +  aligned &= ~(kAlias - 1);
> +
> +  const uintptr_t payload = aligned + offset; // still aligned
> +
> +  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
> +  // The allocated_size can be reconstructed from the payload_size.
> +  AllocationHeader *header = reinterpret_cast<AllocationHeader *>(payload) - 1;
> +  header->allocated = allocated;
> +  header->payload_size = payload_size;
> +
> +  return HWY_ASSUME_ALIGNED(reinterpret_cast<void *>(payload), kAlignment);
> +}
> +
> +template <typename T> static T *AllocateAlignedItems(size_t items) {
> +  constexpr size_t size = sizeof(T);
> +
> +  constexpr bool is_pow2 = (size & (size - 1)) == 0;
> +  constexpr size_t bits = ShiftCount(size);
> +  static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
> +
> +  const size_t bytes = is_pow2 ? items << bits : items * size;
> +  const size_t check = is_pow2 ? bytes >> bits : bytes / size;
> +  if (check != items) {
> +    return nullptr; // overflowed
> +  }
> +  return static_cast<T *>(AllocateAlignedBytes(bytes));
> +}
> +
> +template <typename T>
> +static AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
> +  return AlignedFreeUniquePtr<T[]>(AllocateAlignedItems<T>(items),
> +                                   AlignedFreer());
> +}
> +
> +int main() {
> +  AlignedFreeUniquePtr<uint16_t[]> in_lanes = AllocateAligned<uint16_t>(2);
> +  uint16_t expected_lanes[2];
> +  in_lanes[0] = 65535;
> +  in_lanes[1] = 32767;
> +  expected_lanes[0] = 65534;
> +  expected_lanes[1] = 16383;
> +  hwy::N_EMU128::Vec128<uint16_t, 2> v = Load<uint16_t, 2>(in_lanes.get());
> +  hwy::N_EMU128::Vec128<uint16_t, 2> actual = MulHigh(v, v);
> +  {
> +    auto actual_lanes = AllocateAligned<uint16_t>(2);
> +    Store(actual, actual_lanes.get());
> +    const uint8_t *expected_array =
> +        reinterpret_cast<const uint8_t *>(expected_lanes);
> +    const uint8_t *actual_array =
> +        reinterpret_cast<const uint8_t *>(actual_lanes.get());
> +    for (size_t i = 0; i < 2; ++i) {
> +      const uint8_t *expected_ptr = expected_array + i * 2;
> +      const uint8_t *actual_ptr = actual_array + i * 2;
> +      if (!BytesEqual(expected_ptr, actual_ptr, 2)) {
> +        abort();
> +      }
> +    }
> +  }
> +}
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index f582d238984..c9dab217f05 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3423,6 +3423,14 @@ vectorizable_call (vec_info *vinfo,
>        return false;
>      }
>
> +  if (vect_emulated_vector_p (vectype_in) || vect_emulated_vector_p (vectype_out))
> +  {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "use emulated vector type for call\n");
> +      return false;
> +  }
> +
>    /* FORNOW */
>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
> --
> 2.27.0
>
  
Kewen.Lin Aug. 12, 2022, 11:27 a.m. UTC | #2
on 2022/8/12 19:14, Richard Biener wrote:
> On Fri, Aug 12, 2022 at 11:41 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> Hi,
>>
>> As PR106322 shows, in some cases for some vector type whose
>> TYPE_MODE is a scalar integral mode instead of a vector mode,
>> it's possible to obtain wrong target support information when
>> querying with the scalar integral mode.  For example, for the
>> test case in PR106322, on ppc64 32bit vectorizer gets vector
>> type "vector(2) short unsigned int" for scalar type "short
>> unsigned int", its mode is SImode instead of V2HImode.  The
>> target support querying checks umul_highpart optab with SImode
>> and considers it's supported, then vectorizer further generates
>> .MULH IFN call for that vector type.  Unfortunately it's wrong
>> to use SImode support for that vector type multiply highpart
>> here.
>>
>> This patch is to teach vectorizable_call analysis not to allow
>> vect_emulated_vector_p type for both vectype_in and vectype_out
>> as Richi suggested.
>>
>> Bootstrapped and regtested on x86_64-redhat-linux,
>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>
>> Is it ok for trunk?
> 
> OK for trunk.
> 
>> If it's ok, I guess we want this to be
>> backported?
> 
> Yes, but you just missed the RC for 12.2 so please wait until after GCC 12.2
> is released and the branch is open again.  The testcase looks mightly
> complicated
> so fallout there might be well possible as well ;)  I suppose it wasn't possible
> to craft a simple C testcase after the analysis?

Thanks for the hints!  Let me give it a try next week and get back to you then.

BR,
Kewen
  
Kewen.Lin Aug. 15, 2022, 7:59 a.m. UTC | #3
Hi Richi,

>>
>> Yes, but you just missed the RC for 12.2 so please wait until after GCC 12.2
>> is released and the branch is open again.  The testcase looks mightly
>> complicated
>> so fallout there might be well possible as well ;)  I suppose it wasn't possible
>> to craft a simple C testcase after the analysis?
> 
> Thanks for the hints!  Let me give it a try next week and get back to you then.
> 

As you suggested, I constructed a C testcase, which has been verified on both
i386 and ppc64 (it fails without the patch and passes with it).

Is the attached patch ok for trunk?  And is it also ok for all release branches
after a week or so (once they are open again after the freeze)?

BR,
Kewen
From 8b63b3025d99a38cc0400ebc8d882cbcaf8a22cc Mon Sep 17 00:00:00 2001
From: Kewen Lin <linkw@linux.ibm.com>
Date: Mon, 15 Aug 2022 01:30:48 -0500
Subject: [PATCH] vect: Don't allow vect_emulated_vector_p type in
 vectorizable_call [PR106322]

As PR106322 shows, in some cases for some vector type whose
TYPE_MODE is a scalar integral mode instead of a vector mode,
it's possible to obtain wrong target support information when
querying with the scalar integral mode.  For example, for the
test case in PR106322, on ppc64 32bit vectorizer gets vector
type "vector(2) short unsigned int" for scalar type "short
unsigned int", its mode is SImode instead of V2HImode.  The
target support querying checks umul_highpart optab with SImode
and considers it's supported, then vectorizer further generates
.MULH IFN call for that vector type.  Unfortunately it's wrong
to use SImode support for that vector type multiply highpart
here.

This patch is to teach vectorizable_call analysis not to allow
vect_emulated_vector_p type for both vectype_in and vectype_out
as Richi suggested.

	PR tree-optimization/106322

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_call): Don't allow
	vect_emulated_vector_p type for both vectype_in and vectype_out.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106322.c: New test.
	* gcc.target/powerpc/pr106322.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr106322.c    | 51 +++++++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr106322.c | 50 ++++++++++++++++++++
 gcc/tree-vect-stmts.cc                      |  8 ++++
 3 files changed, 109 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106322.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr106322.c

diff --git a/gcc/testsuite/gcc.target/i386/pr106322.c b/gcc/testsuite/gcc.target/i386/pr106322.c
new file mode 100644
index 00000000000..31333c5fdcc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106322.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ia32 } */
+/* { dg-options "-O2 -mtune=generic -march=i686" } */
+
+/* As PR106322, verify this can execute well (not abort).  */
+
+#define N 64
+typedef unsigned short int uh;
+typedef unsigned short int uw;
+uh a[N];
+uh b[N];
+uh c[N];
+uh e[N];
+
+__attribute__ ((noipa)) void
+foo ()
+{
+  for (int i = 0; i < N; i++)
+    c[i] = ((uw) b[i] * (uw) a[i]) >> 16;
+}
+
+__attribute__ ((optimize ("-O0"))) void
+init ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (uh) (0x7ABC - 0x5 * i);
+      b[i] = (uh) (0xEAB + 0xF * i);
+      e[i] = ((uw) b[i] * (uw) a[i]) >> 16;
+    }
+}
+
+__attribute__ ((optimize ("-O0"))) void
+check ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      if (c[i] != e[i])
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  init ();
+  foo ();
+  check ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr106322.c b/gcc/testsuite/gcc.target/powerpc/pr106322.c
new file mode 100644
index 00000000000..c05072d3416
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr106322.c
@@ -0,0 +1,50 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mdejagnu-cpu=power4" } */
+
+/* As PR106322, verify this can execute well (not abort).  */
+
+#define N 64
+typedef unsigned short int uh;
+typedef unsigned short int uw;
+uh a[N];
+uh b[N];
+uh c[N];
+uh e[N];
+
+__attribute__ ((noipa)) void
+foo ()
+{
+  for (int i = 0; i < N; i++)
+    c[i] = ((uw) b[i] * (uw) a[i]) >> 16;
+}
+
+__attribute__ ((optimize ("-O0"))) void
+init ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (uh) (0x7ABC - 0x5 * i);
+      b[i] = (uh) (0xEAB + 0xF * i);
+      e[i] = ((uw) b[i] * (uw) a[i]) >> 16;
+    }
+}
+
+__attribute__ ((optimize ("-O0"))) void
+check ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      if (c[i] != e[i])
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  init ();
+  foo ();
+  check ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f582d238984..c9dab217f05 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3423,6 +3423,14 @@ vectorizable_call (vec_info *vinfo,
       return false;
     }
 
+  if (vect_emulated_vector_p (vectype_in) || vect_emulated_vector_p (vectype_out))
+  {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "use emulated vector type for call\n");
+      return false;
+  }
+
   /* FORNOW */
   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
  
Richard Biener Aug. 15, 2022, 8:09 a.m. UTC | #4
On Mon, Aug 15, 2022 at 10:00 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi Richi,
>
> >>
> >> Yes, but you just missed the RC for 12.2 so please wait until after GCC 12.2
> >> is released and the branch is open again.  The testcase looks mightly
> >> complicated
> >> so fallout there might be well possible as well ;)  I suppose it wasn't possible
> >> to craft a simple C testcase after the analysis?
> >
> > Thanks for the hints!  Let me give it a try next week and get back to you then.
> >
>
> As you suggested, I constructed one C testcase which has been verified on both i386
> and ppc64 (failed w/o the patch while passed w/ that).
>
> Is this attached patch ok for trunk?  And also ok for all release branches after a
> week or so (also after frozen time)?

Yes.

Thanks,
Richard.

> BR,
> Kewen
  

Patch

diff --git a/gcc/testsuite/g++.target/i386/pr106322.C b/gcc/testsuite/g++.target/i386/pr106322.C
new file mode 100644
index 00000000000..3cd8d6bf225
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr106322.C
@@ -0,0 +1,196 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target ia32 } */
+/* { dg-require-effective-target c++11 } */
+/* { dg-options "-O2 -mtune=generic -march=i686" } */
+
+/* As PR106322, verify this can execute well (not abort).  */
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <memory>
+
+__attribute__((noipa))
+bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) {
+  return memcmp(bytes1, bytes2, size) == 0;
+}
+
+#define HWY_ALIGNMENT 64
+constexpr size_t kAlignment = HWY_ALIGNMENT;
+constexpr size_t kAlias = kAlignment * 4;
+
+namespace hwy {
+namespace N_EMU128 {
+template <typename T, size_t N = 16 / sizeof(T)> struct Vec128 {
+  T raw[16 / sizeof(T)] = {};
+};
+} // namespace N_EMU128
+} // namespace hwy
+
+template <typename T, size_t N>
+static void Store(const hwy::N_EMU128::Vec128<T, N> v,
+                  T *__restrict__ aligned) {
+  __builtin_memcpy(aligned, v.raw, sizeof(T) * N);
+}
+
+template <typename T, size_t N>
+static hwy::N_EMU128::Vec128<T, N> Load(const T *__restrict__ aligned) {
+  hwy::N_EMU128::Vec128<T, N> v;
+  __builtin_memcpy(v.raw, aligned, sizeof(T) * N);
+  return v;
+}
+
+template <size_t N>
+static hwy::N_EMU128::Vec128<uint16_t, N>
+MulHigh(hwy::N_EMU128::Vec128<uint16_t, N> a,
+        const hwy::N_EMU128::Vec128<uint16_t, N> b) {
+  for (size_t i = 0; i < N; ++i) {
+    // Cast to uint32_t first to prevent overflow. Otherwise the result of
+    // uint16_t * uint16_t is in "int" which may overflow. In practice the
+    // result is the same but this way it is also defined.
+    a.raw[i] = static_cast<uint16_t>(
+        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
+        16);
+  }
+  return a;
+}
+
+#define HWY_ASSERT(condition) assert((condition))
+#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+
+#pragma pack(push, 1)
+struct AllocationHeader {
+  void *allocated;
+  size_t payload_size;
+};
+#pragma pack(pop)
+
+static void FreeAlignedBytes(const void *aligned_pointer) {
+  HWY_ASSERT(aligned_pointer != nullptr);
+  if (aligned_pointer == nullptr)
+    return;
+
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  HWY_ASSERT(payload % kAlignment == 0);
+  const AllocationHeader *header =
+      reinterpret_cast<const AllocationHeader *>(payload) - 1;
+
+  free(header->allocated);
+}
+
+class AlignedFreer {
+public:
+  template <typename T> void operator()(T *aligned_pointer) const {
+    FreeAlignedBytes(aligned_pointer);
+  }
+};
+
+template <typename T>
+using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
+
+static inline constexpr size_t ShiftCount(size_t n) {
+  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
+}
+
+namespace {
+static size_t NextAlignedOffset() {
+  static std::atomic<uint32_t> next{0};
+  constexpr uint32_t kGroups = kAlias / kAlignment;
+  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
+  const size_t offset = kAlignment * group;
+  HWY_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
+  return offset;
+}
+} // namespace
+
+static void *AllocateAlignedBytes(const size_t payload_size) {
+  HWY_ASSERT(payload_size != 0); // likely a bug in caller
+  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
+    HWY_ASSERT(false && "payload_size too large");
+    return nullptr;
+  }
+
+  size_t offset = NextAlignedOffset();
+
+  // What: | misalign | unused | AllocationHeader |payload
+  // Size: |<= kAlias | offset                    |payload_size
+  //       ^allocated.^aligned.^header............^payload
+  // The header must immediately precede payload, which must remain aligned.
+  // To avoid wasting space, the header resides at the end of `unused`,
+  // which therefore cannot be empty (offset == 0).
+  if (offset == 0) {
+    offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
+    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
+  }
+
+  const size_t allocated_size = kAlias + offset + payload_size;
+  void *allocated = malloc(allocated_size);
+  HWY_ASSERT(allocated != nullptr);
+  if (allocated == nullptr)
+    return nullptr;
+  // Always round up even if already aligned - we already asked for kAlias
+  // extra bytes and there's no way to give them back.
+  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
+  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
+  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
+  aligned &= ~(kAlias - 1);
+
+  const uintptr_t payload = aligned + offset; // still aligned
+
+  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
+  // The allocated_size can be reconstructed from the payload_size.
+  AllocationHeader *header = reinterpret_cast<AllocationHeader *>(payload) - 1;
+  header->allocated = allocated;
+  header->payload_size = payload_size;
+
+  return HWY_ASSUME_ALIGNED(reinterpret_cast<void *>(payload), kAlignment);
+}
+
+template <typename T> static T *AllocateAlignedItems(size_t items) {
+  constexpr size_t size = sizeof(T);
+
+  constexpr bool is_pow2 = (size & (size - 1)) == 0;
+  constexpr size_t bits = ShiftCount(size);
+  static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
+
+  const size_t bytes = is_pow2 ? items << bits : items * size;
+  const size_t check = is_pow2 ? bytes >> bits : bytes / size;
+  if (check != items) {
+    return nullptr; // overflowed
+  }
+  return static_cast<T *>(AllocateAlignedBytes(bytes));
+}
+
+template <typename T>
+static AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
+  return AlignedFreeUniquePtr<T[]>(AllocateAlignedItems<T>(items),
+                                   AlignedFreer());
+}
+
+int main() {
+  AlignedFreeUniquePtr<uint16_t[]> in_lanes = AllocateAligned<uint16_t>(2);
+  uint16_t expected_lanes[2];
+  in_lanes[0] = 65535;
+  in_lanes[1] = 32767;
+  expected_lanes[0] = 65534;
+  expected_lanes[1] = 16383;
+  hwy::N_EMU128::Vec128<uint16_t, 2> v = Load<uint16_t, 2>(in_lanes.get());
+  hwy::N_EMU128::Vec128<uint16_t, 2> actual = MulHigh(v, v);
+  {
+    auto actual_lanes = AllocateAligned<uint16_t>(2);
+    Store(actual, actual_lanes.get());
+    const uint8_t *expected_array =
+        reinterpret_cast<const uint8_t *>(expected_lanes);
+    const uint8_t *actual_array =
+        reinterpret_cast<const uint8_t *>(actual_lanes.get());
+    for (size_t i = 0; i < 2; ++i) {
+      const uint8_t *expected_ptr = expected_array + i * 2;
+      const uint8_t *actual_ptr = actual_array + i * 2;
+      if (!BytesEqual(expected_ptr, actual_ptr, 2)) {
+        abort();
+      }
+    }
+  }
+}
diff --git a/gcc/testsuite/g++.target/powerpc/pr106322.C b/gcc/testsuite/g++.target/powerpc/pr106322.C
new file mode 100644
index 00000000000..1de6e5e37e5
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/pr106322.C
@@ -0,0 +1,195 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target c++11 } */
+/* { dg-options "-O2 -mdejagnu-cpu=power4" } */
+
+/* As PR106322, verify this can execute well (not abort).  */
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <memory>
+
+__attribute__((noipa))
+bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) {
+  return memcmp(bytes1, bytes2, size) == 0;
+}
+
+#define HWY_ALIGNMENT 64
+constexpr size_t kAlignment = HWY_ALIGNMENT;
+constexpr size_t kAlias = kAlignment * 4;
+
+namespace hwy {
+namespace N_EMU128 {
+template <typename T, size_t N = 16 / sizeof(T)> struct Vec128 {
+  T raw[16 / sizeof(T)] = {};
+};
+} // namespace N_EMU128
+} // namespace hwy
+
+template <typename T, size_t N>
+static void Store(const hwy::N_EMU128::Vec128<T, N> v,
+                  T *__restrict__ aligned) {
+  __builtin_memcpy(aligned, v.raw, sizeof(T) * N);
+}
+
+template <typename T, size_t N>
+static hwy::N_EMU128::Vec128<T, N> Load(const T *__restrict__ aligned) {
+  hwy::N_EMU128::Vec128<T, N> v;
+  __builtin_memcpy(v.raw, aligned, sizeof(T) * N);
+  return v;
+}
+
+template <size_t N>
+static hwy::N_EMU128::Vec128<uint16_t, N>
+MulHigh(hwy::N_EMU128::Vec128<uint16_t, N> a,
+        const hwy::N_EMU128::Vec128<uint16_t, N> b) {
+  for (size_t i = 0; i < N; ++i) {
+    // Cast to uint32_t first to prevent overflow. Otherwise the result of
+    // uint16_t * uint16_t is in "int" which may overflow. In practice the
+    // result is the same but this way it is also defined.
+    a.raw[i] = static_cast<uint16_t>(
+        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
+        16);
+  }
+  return a;
+}
+
+#define HWY_ASSERT(condition) assert((condition))
+#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+
+#pragma pack(push, 1)
+struct AllocationHeader {
+  void *allocated;
+  size_t payload_size;
+};
+#pragma pack(pop)
+
+static void FreeAlignedBytes(const void *aligned_pointer) {
+  HWY_ASSERT(aligned_pointer != nullptr);
+  if (aligned_pointer == nullptr)
+    return;
+
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  HWY_ASSERT(payload % kAlignment == 0);
+  const AllocationHeader *header =
+      reinterpret_cast<const AllocationHeader *>(payload) - 1;
+
+  free(header->allocated);
+}
+
+class AlignedFreer {
+public:
+  template <typename T> void operator()(T *aligned_pointer) const {
+    FreeAlignedBytes(aligned_pointer);
+  }
+};
+
+template <typename T>
+using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
+
+static inline constexpr size_t ShiftCount(size_t n) {
+  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
+}
+
+namespace {
+static size_t NextAlignedOffset() {
+  static std::atomic<uint32_t> next{0};
+  constexpr uint32_t kGroups = kAlias / kAlignment;
+  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
+  const size_t offset = kAlignment * group;
+  HWY_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
+  return offset;
+}
+} // namespace
+
+static void *AllocateAlignedBytes(const size_t payload_size) {
+  HWY_ASSERT(payload_size != 0); // likely a bug in caller
+  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
+    HWY_ASSERT(false && "payload_size too large");
+    return nullptr;
+  }
+
+  size_t offset = NextAlignedOffset();
+
+  // What: | misalign | unused | AllocationHeader |payload
+  // Size: |<= kAlias | offset                    |payload_size
+  //       ^allocated.^aligned.^header............^payload
+  // The header must immediately precede payload, which must remain aligned.
+  // To avoid wasting space, the header resides at the end of `unused`,
+  // which therefore cannot be empty (offset == 0).
+  if (offset == 0) {
+    offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
+    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
+  }
+
+  const size_t allocated_size = kAlias + offset + payload_size;
+  void *allocated = malloc(allocated_size);
+  HWY_ASSERT(allocated != nullptr);
+  if (allocated == nullptr)
+    return nullptr;
+  // Always round up even if already aligned - we already asked for kAlias
+  // extra bytes and there's no way to give them back.
+  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
+  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
+  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
+  aligned &= ~(kAlias - 1);
+
+  const uintptr_t payload = aligned + offset; // still aligned
+
+  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
+  // The allocated_size can be reconstructed from the payload_size.
+  AllocationHeader *header = reinterpret_cast<AllocationHeader *>(payload) - 1;
+  header->allocated = allocated;
+  header->payload_size = payload_size;
+
+  return HWY_ASSUME_ALIGNED(reinterpret_cast<void *>(payload), kAlignment);
+}
+
+template <typename T> static T *AllocateAlignedItems(size_t items) {
+  constexpr size_t size = sizeof(T);
+
+  constexpr bool is_pow2 = (size & (size - 1)) == 0;
+  constexpr size_t bits = ShiftCount(size);
+  static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
+
+  const size_t bytes = is_pow2 ? items << bits : items * size;
+  const size_t check = is_pow2 ? bytes >> bits : bytes / size;
+  if (check != items) {
+    return nullptr; // overflowed
+  }
+  return static_cast<T *>(AllocateAlignedBytes(bytes));
+}
+
+template <typename T>
+static AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
+  return AlignedFreeUniquePtr<T[]>(AllocateAlignedItems<T>(items),
+                                   AlignedFreer());
+}
+
+int main() {
+  AlignedFreeUniquePtr<uint16_t[]> in_lanes = AllocateAligned<uint16_t>(2);
+  uint16_t expected_lanes[2];
+  in_lanes[0] = 65535;
+  in_lanes[1] = 32767;
+  expected_lanes[0] = 65534;
+  expected_lanes[1] = 16383;
+  hwy::N_EMU128::Vec128<uint16_t, 2> v = Load<uint16_t, 2>(in_lanes.get());
+  hwy::N_EMU128::Vec128<uint16_t, 2> actual = MulHigh(v, v);
+  {
+    auto actual_lanes = AllocateAligned<uint16_t>(2);
+    Store(actual, actual_lanes.get());
+    const uint8_t *expected_array =
+        reinterpret_cast<const uint8_t *>(expected_lanes);
+    const uint8_t *actual_array =
+        reinterpret_cast<const uint8_t *>(actual_lanes.get());
+    for (size_t i = 0; i < 2; ++i) {
+      const uint8_t *expected_ptr = expected_array + i * 2;
+      const uint8_t *actual_ptr = actual_array + i * 2;
+      if (!BytesEqual(expected_ptr, actual_ptr, 2)) {
+        abort();
+      }
+    }
+  }
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f582d238984..c9dab217f05 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3423,6 +3423,14 @@  vectorizable_call (vec_info *vinfo,
       return false;
     }

+  if (vect_emulated_vector_p (vectype_in) || vect_emulated_vector_p (vectype_out))
+  {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "use emulated vector type for call\n");
+      return false;
+  }
+
   /* FORNOW */
   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);