[X86] Allow AVX2 per-element shift intrinsics to be used in constexpr (#154780)

This adds constant folding for the AVX2 per-element shift intrinsics, which support out-of-bounds shift amounts (logical shifts produce 0, arithmetic shifts produce a sign-bit splat).
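As a rough scalar model of these rules on a single 32-bit lane (illustrative only; the helper names are not part of the patch):

#include <cstdint>

// Logical shifts: an out-of-range count produces zero.
constexpr uint32_t lane_sllv(uint32_t v, uint32_t count) {
  return count >= 32 ? 0u : (v << count);
}
constexpr uint32_t lane_srlv(uint32_t v, uint32_t count) {
  return count >= 32 ? 0u : (v >> count);
}
// Arithmetic shift: an out-of-range count produces a splat of the sign bit
// (assumes the usual arithmetic right shift on signed integers).
constexpr int32_t lane_srav(int32_t v, uint32_t count) {
  return v >> (count >= 32 ? 31u : count);
}

static_assert(lane_srlv(0x80000000u, 40) == 0u, "logical: out of range -> 0");
static_assert(lane_srav(-8, 40) == -1, "arithmetic: out of range -> sign splat");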

The AVX512 intrinsics will be handled in follow-up patches.

First stage of #154287
Simon Pilgrim 2025-08-22 09:24:24 +01:00 committed by GitHub
parent 4ab5efd48d
commit 8d7df8bba1
4 changed files with 82 additions and 67 deletions


@ -627,11 +627,23 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
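The Constexpr attribute added to these blocks is what lets the builtins be evaluated in constant expressions. A minimal sketch of the effect, calling one of the builtins directly (assumes a C++ translation unit compiled with -mavx2; the local vector typedef and variable names are illustrative):

typedef int v8si __attribute__((__vector_size__(32)));

constexpr v8si kVal = {1, 1, 1, 1, 1, 1, 1, 1};
constexpr v8si kCnt = {0, 1, 2, 3, 4, 31, 32, 33};
constexpr v8si kRes = __builtin_ia32_psllv8si(kVal, kCnt); // folded at compile time

static_assert(kRes[1] == 2, "in-range count shifts normally");
static_assert(kRes[6] == 0, "a count of 32 is out of range and folds to zero");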
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
@ -654,46 +666,6 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def maskstoreq : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def gatherd_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<4, int>, _Vector<2, double>, _Constant char)">;
}


@ -11669,13 +11669,24 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pmulhuw512:
case clang::X86::BI__builtin_ia32_pmulhw128:
case clang::X86::BI__builtin_ia32_pmulhw256:
case clang::X86::BI__builtin_ia32_pmulhw512: {
case clang::X86::BI__builtin_ia32_pmulhw512:
case clang::X86::BI__builtin_ia32_psllv2di:
case clang::X86::BI__builtin_ia32_psllv4di:
case clang::X86::BI__builtin_ia32_psllv4si:
case clang::X86::BI__builtin_ia32_psllv8si:
case clang::X86::BI__builtin_ia32_psrav4si:
case clang::X86::BI__builtin_ia32_psrav8si:
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
case clang::X86::BI__builtin_ia32_psrlv8si: {
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
unsigned SourceLen = SourceLHS.getVectorLength();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen);
@ -11687,12 +11698,12 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case Builtin::BI__builtin_elementwise_add_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS),
DestEltTy->isUnsignedIntegerOrEnumerationType())));
DestUnsigned)));
break;
case Builtin::BI__builtin_elementwise_sub_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS),
DestEltTy->isUnsignedIntegerOrEnumerationType())));
DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
@ -11706,6 +11717,40 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhs(LHS, RHS),
/*isUnsigned=*/false)));
break;
case clang::X86::BI__builtin_ia32_psllv2di:
case clang::X86::BI__builtin_ia32_psllv4di:
case clang::X86::BI__builtin_ia32_psllv4si:
case clang::X86::BI__builtin_ia32_psllv8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.shl(RHS.getZExtValue()), DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_psrav4si:
case clang::X86::BI__builtin_ia32_psrav8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(LHS.ashr(RHS.getBitWidth() - 1), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.ashr(RHS.getZExtValue()), DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
case clang::X86::BI__builtin_ia32_psrlv8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.lshr(RHS.getZExtValue()), DestUnsigned)));
break;
}
}
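For reference, a self-contained sketch of the per-lane fold implemented above, written directly against LLVM's APInt (assumes the LLVM ADT headers; the helper is illustrative and simplified from the evaluator code):

#include "llvm/ADT/APInt.h"
#include <cstdint>

using llvm::APInt;

enum class ShiftKind { LogicalLeft, ArithmeticRight, LogicalRight };

// One lane of the variable-shift fold; LHS and RHS have the same bit width.
static APInt foldShiftLane(const APInt &LHS, const APInt &RHS, ShiftKind Kind) {
  unsigned BitWidth = RHS.getBitWidth();
  if (RHS.uge(BitWidth)) {
    // Out-of-range count: logical shifts fold to zero; the arithmetic
    // right shift folds to a splat of LHS's sign bit.
    return Kind == ShiftKind::ArithmeticRight ? LHS.ashr(BitWidth - 1)
                                              : APInt::getZero(BitWidth);
  }
  uint64_t Amt = RHS.getZExtValue();
  if (Kind == ShiftKind::LogicalLeft)
    return LHS.shl(Amt);
  if (Kind == ShiftKind::ArithmeticRight)
    return LHS.ashr(Amt);
  return LHS.lshr(Amt);
}

This mirrors the documented behaviour of the VPSLLV/VPSRAV/VPSRLV instructions, which is why the evaluator can fold out-of-range counts without clamping.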


@ -3721,7 +3721,7 @@ _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_sllv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
@ -3743,7 +3743,7 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_sllv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
@ -3765,7 +3765,7 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_sllv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
@ -3787,7 +3787,7 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_sllv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
@ -3810,7 +3810,7 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srav_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
@ -3833,7 +3833,7 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srav_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
@ -3855,7 +3855,7 @@ _mm_srav_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srlv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
@ -3877,7 +3877,7 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srlv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
@ -3899,7 +3899,7 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srlv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
@ -3921,7 +3921,7 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srlv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
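With the __DEFAULT_FN_ATTRS128/256_CONSTEXPR attribute sets applied above, these wrappers can now appear in constant expressions. A small usage sketch (assumes C++ compiled with -mavx2; the variable names are illustrative):

#include <immintrin.h>

// Evaluated entirely at compile time; the last lane uses an out-of-range count.
constexpr __m128i kVal = (__m128i)(__v4si){5, 5, 5, 5};
constexpr __m128i kCnt = (__m128i)(__v4si){0, 1, 2, 33};
constexpr __m128i kRes = _mm_sllv_epi32(kVal, kCnt);

static_assert(((__v4si)kRes)[1] == 10, "in-range count");
static_assert(((__v4si)kRes)[3] == 0, "count >= 32 folds to zero");

The TEST_CONSTEXPR checks added to the tests below exercise the same property across all ten updated intrinsics.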


@ -327,7 +327,6 @@ __m256i test_mm256_cvtepi8_epi16(__m128i a) {
// CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepi8_epi16(a);
}
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepi8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12));
__m256i test_mm256_cvtepi8_epi32(__m128i a) {
@ -336,7 +335,6 @@ __m256i test_mm256_cvtepi8_epi32(__m128i a) {
// CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepi8_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi8_epi64(__m128i a) {
@ -345,7 +343,6 @@ __m256i test_mm256_cvtepi8_epi64(__m128i a) {
// CHECK: sext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepi8_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0));
__m256i test_mm256_cvtepi16_epi32(__m128i a) {
@ -353,7 +350,6 @@ __m256i test_mm256_cvtepi16_epi32(__m128i a) {
// CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
return _mm256_cvtepi16_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi16_epi64(__m128i a) {
@ -362,7 +358,6 @@ __m256i test_mm256_cvtepi16_epi64(__m128i a) {
// CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepi16_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0));
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
@ -370,7 +365,6 @@ __m256i test_mm256_cvtepi32_epi64(__m128i a) {
// CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepi32_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), -70000, 2, -1, 0));
__m256i test_mm256_cvtepu8_epi16(__m128i a) {
@ -378,7 +372,6 @@ __m256i test_mm256_cvtepu8_epi16(__m128i a) {
// CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepu8_epi16(a);
}
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepu8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252, 5, 250, 7, 248, 9, 246, 11, 244));
__m256i test_mm256_cvtepu8_epi32(__m128i a) {
@ -387,7 +380,6 @@ __m256i test_mm256_cvtepu8_epi32(__m128i a) {
// CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepu8_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252));
__m256i test_mm256_cvtepu8_epi64(__m128i a) {
@ -396,7 +388,6 @@ __m256i test_mm256_cvtepu8_epi64(__m128i a) {
// CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepu8_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0));
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
@ -404,7 +395,6 @@ __m256i test_mm256_cvtepu16_epi32(__m128i a) {
// CHECK: zext <8 x i16> {{.*}} to <8 x i32>
return _mm256_cvtepu16_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0, 1, 65534, 3, 65532));
__m256i test_mm256_cvtepu16_epi64(__m128i a) {
@ -413,7 +403,6 @@ __m256i test_mm256_cvtepu16_epi64(__m128i a) {
// CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepu16_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0));
__m256i test_mm256_cvtepu32_epi64(__m128i a) {
@ -421,7 +410,6 @@ __m256i test_mm256_cvtepu32_epi64(__m128i a) {
// CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepu32_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), 4294897296, 2, 4294967295, 0));
__m128i test0_mm256_extracti128_si256_0(__m256i a) {
@ -1120,24 +1108,28 @@ __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_sllv_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_sllv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_sllv_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_sllv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 2, -8, 24, -64, 0, 0, 0, 0));
__m128i test_mm_sllv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sllv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_sllv_epi64(a, b);
}
TEST_CONSTEXPR(match_m128i(_mm_sllv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 256, 0x8000000000000000ULL));
__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_sllv_epi64(a, b);
}
TEST_CONSTEXPR(match_m256i(_mm256_sllv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_sra_epi16
@ -1180,12 +1172,14 @@ __m128i test_mm_srav_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srav_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_srav_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, -1, 0, -1));
__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srav_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srav_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_srav_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, -1, 0, -1, 0, -1, 0, -1));
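For reference, a lane-by-lane hand check of the 256-bit arithmetic-shift expectation above (shift counts are read as unsigned, so negative counts are out of range):

   1 >> 1  =  0       5 >> (-17) -> out of range, sign bit of 5 is 0 -> 0
  -2 >> 2  = -1      -6 >> 31    = -1
   3 >> 3  =  0       7 >> 33    -> out of range, sign bit of 7 is 0 -> 0
  -4 >> 4  = -1      -8 >> 29    = -1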
__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_srl_epi16
@ -1252,24 +1246,28 @@ __m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srlv_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_srlv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, 1073741823, 0, 0));
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srlv_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_srlv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, 1073741823, 0, 268435455, 0, 1, 0, 7));
__m128i test_mm_srlv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_srlv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_srlv_epi64(a, b);
}
TEST_CONSTEXPR(match_m128i(_mm_srlv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 0, 1));
__m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_srlv_epi64(a, b);
}
TEST_CONSTEXPR(match_m256i(_mm256_srlv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 0, 0x3FFFFFFFFFFFFFFFULL, 0, 0));
__m256i test_mm256_stream_load_si256(__m256i const *a) {
// CHECK-LABEL: test_mm256_stream_load_si256