[VectorCombine] Shrink loads used in shufflevector rebroadcasts. (#153138)

Reopen #128938.

Attempt to shrink the size of vector loads where only some of the
incoming lanes are used for rebroadcasts in shufflevector instructions.

---------

Co-authored-by: Leon Clark <leoclark@amd.com>
Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
This commit is contained in:
Leon Clark 2025-08-12 14:08:37 +01:00 committed by GitHub
parent b8104fa320
commit 9115bef8ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 567 additions and 36 deletions

View File

@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
void kernel float3_to_double2(global float3 *a, global double2 *b) {
@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//

View File

@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@ -29,13 +30,16 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <optional>
#include <queue>
#include <set>
#include <tuple>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@ -137,6 +141,7 @@ private:
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
bool shrinkLoadForShuffles(Instruction &I);
void replaceValue(Value &Old, Value &New) {
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@ -3862,6 +3867,133 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
// Attempt to shrink loads that are only used by shufflevector instructions.
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto *OldLoad = dyn_cast<LoadInst>(&I);
if (!OldLoad || !OldLoad->isSimple())
return false;
auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
if (!OldLoadTy)
return false;
unsigned const OldNumElements = OldLoadTy->getNumElements();
// Search all uses of load. If all uses are shufflevector instructions, and
// the second operands are all poison values, find the minimum and maximum
// indices of the vector elements referenced by all shuffle masks.
// Otherwise return `std::nullopt`.
using IndexRange = std::pair<int, int>;
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
IndexRange OutputRange = IndexRange(OldNumElements, -1);
for (llvm::Use &Use : I.uses()) {
// Ensure all uses match the required pattern.
User *Shuffle = Use.getUser();
ArrayRef<int> Mask;
if (!match(Shuffle,
m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
return std::nullopt;
// Ignore shufflevector instructions that have no uses.
if (Shuffle->use_empty())
continue;
// Find the min and max indices used by the shufflevector instruction.
for (int Index : Mask) {
if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
OutputRange.first = std::min(Index, OutputRange.first);
OutputRange.second = std::max(Index, OutputRange.second);
}
}
}
if (OutputRange.second < OutputRange.first)
return std::nullopt;
return OutputRange;
};
// Get the range of vector elements used by shufflevector instructions.
if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
unsigned const NewNumElements = Indices->second + 1u;
// If the range of vector elements is smaller than the full load, attempt
// to create a smaller load.
if (NewNumElements < OldNumElements) {
IRBuilder Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
// Calculate costs of old and new ops.
Type *ElemTy = OldLoadTy->getElementType();
FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
Value *PtrOp = OldLoad->getPointerOperand();
InstructionCost OldCost = TTI.getMemoryOpCost(
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
OldLoad->getPointerAddressSpace(), CostKind);
InstructionCost NewCost =
TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
OldLoad->getPointerAddressSpace(), CostKind);
using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
SmallVector<UseEntry, 4u> NewUses;
unsigned const MaxIndex = NewNumElements * 2u;
for (llvm::Use &Use : I.uses()) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
ArrayRef<int> OldMask = Shuffle->getShuffleMask();
// Create entry for new use.
NewUses.push_back({Shuffle, OldMask});
// Validate mask indices.
for (int Index : OldMask) {
if (Index >= static_cast<int>(MaxIndex))
return false;
}
// Update costs.
OldCost +=
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
OldLoadTy, OldMask, CostKind);
NewCost +=
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
NewLoadTy, OldMask, CostKind);
}
LLVM_DEBUG(
dbgs() << "Found a load used only by shufflevector instructions: "
<< I << "\n OldCost: " << OldCost
<< " vs NewCost: " << NewCost << "\n");
if (OldCost < NewCost || !NewCost.isValid())
return false;
// Create new load of smaller vector.
auto *NewLoad = cast<LoadInst>(
Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
NewLoad->copyMetadata(I);
// Replace all uses.
for (UseEntry &Use : NewUses) {
ShuffleVectorInst *Shuffle = Use.first;
std::vector<int> &NewMask = Use.second;
Builder.SetInsertPoint(Shuffle);
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
Value *NewShuffle = Builder.CreateShuffleVector(
NewLoad, PoisonValue::get(NewLoadTy), NewMask);
replaceValue(*Shuffle, *NewShuffle);
}
return true;
}
}
return false;
}
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@ -3938,6 +4070,9 @@ bool VectorCombine::run() {
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
break;
case Instruction::Load:
MadeChange |= shrinkLoadForShuffles(I);
break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;

View File

@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8

View File

@ -252,8 +252,7 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@ -341,8 +340,7 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0

View File

@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1

View File

@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
; SSE-NEXT: ret <4 x double> [[BLEND]]
;
; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; AVX-NEXT: ret <4 x double> [[BLEND]]
; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
; CHECK-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX: {{.*}}
; SSE: {{.*}}

View File

@ -0,0 +1,404 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
%val0 = load volatile <4 x half>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x half> %val1
}
define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
%val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x half> %val1
}
define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
%val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
ret <8 x half> %val1
}
define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
%val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
ret <4 x half> %val1
}
define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
%val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
ret <8 x half> %val1
}
define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
%val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
br i1 %cond, label %then, label %else
then:
%val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
br label %finally
else:
%val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
br label %finally
finally:
%val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ]
ret <8 x half> %val3
}
define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x half> [[VAL3]]
;
entry:
%val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
br i1 %cond, label %then, label %else
then:
%val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
br label %finally
else:
%val2 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
br label %finally
finally:
%val3 = phi <4 x half> [ %val1, %then ], [ %val2, %else ]
ret <4 x half> %val3
}
define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
%val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
br i1 %cond, label %then, label %else
then:
%val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
br label %finally
else:
%val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
br label %finally
finally:
%val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ]
ret <8 x half> %val3
}
define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %val1
}
define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %val1
}
define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
ret <4 x i32> %val1
}
define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %val1
}
define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
br i1 %cond, label %then, label %else
then:
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
br label %finally
else:
%val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
br label %finally
finally:
%val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
ret <8 x i32> %val3
}
define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
br i1 %cond, label %then, label %else
then:
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
br label %finally
else:
%val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
br label %finally
finally:
%val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
ret <8 x i32> %val3
}
define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x i32> [[VAL3]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
br i1 %cond, label %then, label %else
then:
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
br label %finally
else:
%val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
br label %finally
finally:
%val3 = phi <4 x i32> [ %val1, %then ], [ %val2, %else ]
ret <4 x i32> %val3
}
define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
br i1 %cond, label %then, label %else
then:
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
br label %finally
else:
%val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
br label %finally
finally:
%val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
ret <8 x i32> %val3
}
define <8 x i32> @shuffle_v4_v8i32_cond_r1_4(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_4(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
br i1 %cond, label %then, label %else
then:
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
br label %finally
else:
%val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4>
br label %finally
finally:
%val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
ret <8 x i32> %val3
}
define <16 x i8> @shuffle_v16_v16i8_r0_31(ptr %arg) {
; CHECK-LABEL: define <16 x i8> @shuffle_v16_v16i8_r0_31(
; CHECK-SAME: ptr [[ARG:%.*]]) {
; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[ARG]], align 1
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: ret <16 x i8> [[SHUF]]
;
%load= load <16 x i8>, ptr %arg, align 1
%shuf = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i8> %shuf
}