[X86] Allow AVX2 per-element shift intrinsics to be used in constexpr (#154780)

This adds constant folding for the AVX2 per-element shift intrinsics, which support out-of-bounds shift amounts (logical shifts produce 0, arithmetic shifts produce a sign-bit splat).
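As a rough scalar model of these rules on a single 32-bit lane (illustrative only; the helper names are not part of the patch):

#include <cstdint>

// Logical shifts: an out-of-range count produces zero.
constexpr uint32_t lane_sllv(uint32_t v, uint32_t count) {
  return count >= 32 ? 0u : (v << count);
}
constexpr uint32_t lane_srlv(uint32_t v, uint32_t count) {
  return count >= 32 ? 0u : (v >> count);
}
// Arithmetic shift: an out-of-range count produces a splat of the sign bit
// (assumes the usual arithmetic right shift on signed integers).
constexpr int32_t lane_srav(int32_t v, uint32_t count) {
  return v >> (count >= 32 ? 31u : count);
}

static_assert(lane_srlv(0x80000000u, 40) == 0u, "logical: out of range -> 0");
static_assert(lane_srav(-8, 40) == -1, "arithmetic: out of range -> sign splat");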

The AVX512 intrinsics will be handled in follow-up patches.

First stage of #154287
Simon Pilgrim 2025-08-22 09:24:24 +01:00 committed by GitHub
parent 4ab5efd48d
commit 8d7df8bba1
4 changed files with 82 additions and 67 deletions


@ -627,11 +627,23 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
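The Constexpr attribute added to these blocks is what lets the builtins be evaluated in constant expressions. A minimal sketch of the effect, calling one of the builtins directly (assumes a C++ translation unit compiled with -mavx2; the local vector typedef and variable names are illustrative):

typedef int v8si __attribute__((__vector_size__(32)));

constexpr v8si kVal = {1, 1, 1, 1, 1, 1, 1, 1};
constexpr v8si kCnt = {0, 1, 2, 3, 4, 31, 32, 33};
constexpr v8si kRes = __builtin_ia32_psllv8si(kVal, kCnt); // folded at compile time

static_assert(kRes[1] == 2, "in-range count shifts normally");
static_assert(kRes[6] == 0, "a count of 32 is out of range and folds to zero");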
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
@ -654,46 +666,6 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def maskstoreq : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def gatherd_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<4, int>, _Vector<2, double>, _Constant char)">;
}


@ -11669,13 +11669,24 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pmulhuw512:
case clang::X86::BI__builtin_ia32_pmulhw128:
case clang::X86::BI__builtin_ia32_pmulhw256:
case clang::X86::BI__builtin_ia32_pmulhw512: {
case clang::X86::BI__builtin_ia32_pmulhw512:
case clang::X86::BI__builtin_ia32_psllv2di:
case clang::X86::BI__builtin_ia32_psllv4di:
case clang::X86::BI__builtin_ia32_psllv4si:
case clang::X86::BI__builtin_ia32_psllv8si:
case clang::X86::BI__builtin_ia32_psrav4si:
case clang::X86::BI__builtin_ia32_psrav8si:
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
case clang::X86::BI__builtin_ia32_psrlv8si: {
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
unsigned SourceLen = SourceLHS.getVectorLength();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen);
@ -11687,12 +11698,12 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case Builtin::BI__builtin_elementwise_add_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS),
DestEltTy->isUnsignedIntegerOrEnumerationType())));
DestUnsigned)));
break;
case Builtin::BI__builtin_elementwise_sub_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS),
DestEltTy->isUnsignedIntegerOrEnumerationType())));
DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
@ -11706,6 +11717,40 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhs(LHS, RHS),
/*isUnsigned=*/false)));
break;
case clang::X86::BI__builtin_ia32_psllv2di:
case clang::X86::BI__builtin_ia32_psllv4di:
case clang::X86::BI__builtin_ia32_psllv4si:
case clang::X86::BI__builtin_ia32_psllv8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.shl(RHS.getZExtValue()), DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_psrav4si:
case clang::X86::BI__builtin_ia32_psrav8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(LHS.ashr(RHS.getBitWidth() - 1), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.ashr(RHS.getZExtValue()), DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
case clang::X86::BI__builtin_ia32_psrlv8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.lshr(RHS.getZExtValue()), DestUnsigned)));
break;
}
}
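For reference, a self-contained sketch of the per-lane fold implemented above, written directly against LLVM's APInt (assumes the LLVM ADT headers; the helper is illustrative and simplified from the evaluator code):

#include "llvm/ADT/APInt.h"
#include <cstdint>

using llvm::APInt;

enum class ShiftKind { LogicalLeft, ArithmeticRight, LogicalRight };

// One lane of the variable-shift fold; LHS and RHS have the same bit width.
static APInt foldShiftLane(const APInt &LHS, const APInt &RHS, ShiftKind Kind) {
  unsigned BitWidth = RHS.getBitWidth();
  if (RHS.uge(BitWidth)) {
    // Out-of-range count: logical shifts fold to zero; the arithmetic
    // right shift folds to a splat of LHS's sign bit.
    return Kind == ShiftKind::ArithmeticRight ? LHS.ashr(BitWidth - 1)
                                              : APInt::getZero(BitWidth);
  }
  uint64_t Amt = RHS.getZExtValue();
  if (Kind == ShiftKind::LogicalLeft)
    return LHS.shl(Amt);
  if (Kind == ShiftKind::ArithmeticRight)
    return LHS.ashr(Amt);
  return LHS.lshr(Amt);
}

This mirrors the documented behaviour of the VPSLLV/VPSRAV/VPSRLV instructions, which is why the evaluator can fold out-of-range counts without clamping.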


@ -3721,7 +3721,7 @@ _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_sllv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
@ -3743,7 +3743,7 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_sllv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
@ -3765,7 +3765,7 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_sllv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
@ -3787,7 +3787,7 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_sllv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
@ -3810,7 +3810,7 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srav_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
@ -3833,7 +3833,7 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srav_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
@ -3855,7 +3855,7 @@ _mm_srav_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srlv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
@ -3877,7 +3877,7 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srlv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
@ -3899,7 +3899,7 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srlv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
@ -3921,7 +3921,7 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srlv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
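With the __DEFAULT_FN_ATTRS128/256_CONSTEXPR attribute sets applied above, these wrappers can now appear in constant expressions. A small usage sketch (assumes C++ compiled with -mavx2; the variable names are illustrative):

#include <immintrin.h>

// Evaluated entirely at compile time; the last lane uses an out-of-range count.
constexpr __m128i kVal = (__m128i)(__v4si){5, 5, 5, 5};
constexpr __m128i kCnt = (__m128i)(__v4si){0, 1, 2, 33};
constexpr __m128i kRes = _mm_sllv_epi32(kVal, kCnt);

static_assert(((__v4si)kRes)[1] == 10, "in-range count");
static_assert(((__v4si)kRes)[3] == 0, "count >= 32 folds to zero");

The TEST_CONSTEXPR checks added to the tests below exercise the same property across all ten updated intrinsics.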


@ -327,7 +327,6 @@ __m256i test_mm256_cvtepi8_epi16(__m128i a) {
// CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepi8_epi16(a);
}
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepi8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12));
__m256i test_mm256_cvtepi8_epi32(__m128i a) {
@ -336,7 +335,6 @@ __m256i test_mm256_cvtepi8_epi32(__m128i a) {
// CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepi8_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi8_epi64(__m128i a) {
@ -345,7 +343,6 @@ __m256i test_mm256_cvtepi8_epi64(__m128i a) {
// CHECK: sext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepi8_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0));
__m256i test_mm256_cvtepi16_epi32(__m128i a) {
@ -353,7 +350,6 @@ __m256i test_mm256_cvtepi16_epi32(__m128i a) {
// CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
return _mm256_cvtepi16_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi16_epi64(__m128i a) {
@ -362,7 +358,6 @@ __m256i test_mm256_cvtepi16_epi64(__m128i a) {
// CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepi16_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0));
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
@ -370,7 +365,6 @@ __m256i test_mm256_cvtepi32_epi64(__m128i a) {
// CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepi32_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), -70000, 2, -1, 0));
__m256i test_mm256_cvtepu8_epi16(__m128i a) {
@ -378,7 +372,6 @@ __m256i test_mm256_cvtepu8_epi16(__m128i a) {
// CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepu8_epi16(a);
}
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepu8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252, 5, 250, 7, 248, 9, 246, 11, 244));
__m256i test_mm256_cvtepu8_epi32(__m128i a) {
@ -387,7 +380,6 @@ __m256i test_mm256_cvtepu8_epi32(__m128i a) {
// CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepu8_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252));
__m256i test_mm256_cvtepu8_epi64(__m128i a) {
@ -396,7 +388,6 @@ __m256i test_mm256_cvtepu8_epi64(__m128i a) {
// CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepu8_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0));
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
@ -404,7 +395,6 @@ __m256i test_mm256_cvtepu16_epi32(__m128i a) {
// CHECK: zext <8 x i16> {{.*}} to <8 x i32>
return _mm256_cvtepu16_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0, 1, 65534, 3, 65532));
__m256i test_mm256_cvtepu16_epi64(__m128i a) {
@ -413,7 +403,6 @@ __m256i test_mm256_cvtepu16_epi64(__m128i a) {
// CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepu16_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0));
__m256i test_mm256_cvtepu32_epi64(__m128i a) {
@ -421,7 +410,6 @@ __m256i test_mm256_cvtepu32_epi64(__m128i a) {
// CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepu32_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), 4294897296, 2, 4294967295, 0));
__m128i test0_mm256_extracti128_si256_0(__m256i a) {
@ -1120,24 +1108,28 @@ __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_sllv_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_sllv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_sllv_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_sllv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 2, -8, 24, -64, 0, 0, 0, 0));
__m128i test_mm_sllv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sllv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_sllv_epi64(a, b);
}
TEST_CONSTEXPR(match_m128i(_mm_sllv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 256, 0x8000000000000000ULL));
__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_sllv_epi64(a, b);
}
TEST_CONSTEXPR(match_m256i(_mm256_sllv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_sra_epi16
@ -1180,12 +1172,14 @@ __m128i test_mm_srav_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srav_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_srav_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, -1, 0, -1));
__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srav_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srav_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_srav_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, -1, 0, -1, 0, -1, 0, -1));
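For reference, a lane-by-lane hand check of the 256-bit arithmetic-shift expectation above (shift counts are read as unsigned, so negative counts are out of range):

   1 >> 1  =  0       5 >> (-17) -> out of range, sign bit of 5 is 0 -> 0
  -2 >> 2  = -1      -6 >> 31    = -1
   3 >> 3  =  0       7 >> 33    -> out of range, sign bit of 7 is 0 -> 0
  -4 >> 4  = -1      -8 >> 29    = -1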
__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_srl_epi16
@ -1252,24 +1246,28 @@ __m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srlv_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_srlv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, 1073741823, 0, 0));
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srlv_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_srlv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, 1073741823, 0, 268435455, 0, 1, 0, 7));
__m128i test_mm_srlv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_srlv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_srlv_epi64(a, b);
}
TEST_CONSTEXPR(match_m128i(_mm_srlv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 0, 1));
__m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_srlv_epi64(a, b);
}
TEST_CONSTEXPR(match_m256i(_mm256_srlv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 0, 0x3FFFFFFFFFFFFFFFULL, 0, 0));
__m256i test_mm256_stream_load_si256(__m256i const *a) {
// CHECK-LABEL: test_mm256_stream_load_si256