[NVPTX] Disable v2f32 registers when no operations supported, or via cl::opt (#154476)
The addition of v2f32 as a legal type, supported by the B64 register class, has caused performance regressions, broken inline assembly, and resulted in a couple of (now fixed) miscompilations. In order to mitigate these issues, only mark this as a legal type when there exist operations that support it, since for targets where this is not the case it serves no purpose. To enable further debugging, add an option to disable v2f32. In order to allow for a target-dependent set of legal types, ComputePTXValueVTs has been fully rewritten to take advantage of TargetLowering call-lowering APIs.
This commit is contained in:
parent
0319a7970d
commit
a9de1ab44d
@ -196,7 +196,8 @@ static bool IsPTXVectorType(MVT VT) {
|
||||
// - unsigned int NumElts - The number of elements in the final vector
|
||||
// - EVT EltVT - The type of the elements in the final vector
|
||||
static std::optional<std::pair<unsigned int, MVT>>
|
||||
getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
|
||||
getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
|
||||
unsigned AddressSpace) {
|
||||
if (!VectorEVT.isSimple())
|
||||
return std::nullopt;
|
||||
const MVT VectorVT = VectorEVT.getSimpleVT();
|
||||
@ -213,6 +214,8 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
|
||||
// The size of the PTX virtual register that holds a packed type.
|
||||
unsigned PackRegSize;
|
||||
|
||||
bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
|
||||
|
||||
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
|
||||
// legal. We can (and should) split that into 2 stores of <2 x double> here
|
||||
// but I'm leaving that as a TODO for now.
|
||||
@ -263,6 +266,8 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
|
||||
LLVM_FALLTHROUGH;
|
||||
case MVT::v2f32: // <1 x f32x2>
|
||||
case MVT::v4f32: // <2 x f32x2>
|
||||
if (!STI.hasF32x2Instructions())
|
||||
return std::pair(NumElts, EltVT);
|
||||
PackRegSize = 64;
|
||||
break;
|
||||
}
|
||||
@ -278,97 +283,44 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
|
||||
}
|
||||
|
||||
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
|
||||
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
|
||||
/// into their primitive components.
|
||||
/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
|
||||
/// the types as required by the calling convention (with special handling for
|
||||
/// i8s).
|
||||
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
|
||||
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
|
||||
/// LowerCall, and LowerReturn.
|
||||
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
|
||||
LLVMContext &Ctx, CallingConv::ID CallConv,
|
||||
Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
|
||||
SmallVectorImpl<uint64_t> *Offsets = nullptr,
|
||||
SmallVectorImpl<uint64_t> &Offsets,
|
||||
uint64_t StartingOffset = 0) {
|
||||
SmallVector<EVT, 16> TempVTs;
|
||||
SmallVector<uint64_t, 16> TempOffsets;
|
||||
|
||||
// Special case for i128 - decompose to (i64, i64)
|
||||
if (Ty->isIntegerTy(128) || Ty->isFP128Ty()) {
|
||||
ValueVTs.append({MVT::i64, MVT::i64});
|
||||
|
||||
if (Offsets)
|
||||
Offsets->append({StartingOffset + 0, StartingOffset + 8});
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
|
||||
if (StructType *STy = dyn_cast<StructType>(Ty)) {
|
||||
auto const *SL = DL.getStructLayout(STy);
|
||||
auto ElementNum = 0;
|
||||
for(auto *EI : STy->elements()) {
|
||||
ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
|
||||
StartingOffset + SL->getElementOffset(ElementNum));
|
||||
++ElementNum;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Given an array type, recursively traverse the elements with custom ComputePTXValueVTs.
|
||||
if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
|
||||
Type *EltTy = ATy->getElementType();
|
||||
uint64_t EltSize = DL.getTypeAllocSize(EltTy);
|
||||
for (int I : llvm::seq<int>(ATy->getNumElements()))
|
||||
ComputePTXValueVTs(TLI, DL, EltTy, ValueVTs, Offsets, StartingOffset + I * EltSize);
|
||||
return;
|
||||
}
|
||||
|
||||
// Will split structs and arrays into member types, but will not split vector
|
||||
// types. We do that manually below.
|
||||
ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
|
||||
|
||||
for (auto [VT, Off] : zip(TempVTs, TempOffsets)) {
|
||||
// Split vectors into individual elements that fit into registers.
|
||||
if (VT.isVector()) {
|
||||
unsigned NumElts = VT.getVectorNumElements();
|
||||
EVT EltVT = VT.getVectorElementType();
|
||||
// Below we must maintain power-of-2 sized vectors because
|
||||
// TargetLoweringBase::getVectorTypeBreakdown() which is invoked in
|
||||
// ComputePTXValueVTs() cannot currently break down non-power-of-2 sized
|
||||
// vectors.
|
||||
for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
|
||||
MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
|
||||
unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
|
||||
|
||||
// If the element type belongs to one of the supported packed vector types
|
||||
// then we can pack multiples of this element into a single register.
|
||||
if (VT == MVT::v2i8) {
|
||||
// We can pack 2 i8s into a single 16-bit register. We only do this for
|
||||
// loads and stores, which is why we have a separate case for it.
|
||||
EltVT = MVT::v2i8;
|
||||
NumElts = 1;
|
||||
} else if (VT == MVT::v3i8) {
|
||||
// We can also pack 3 i8s into 32-bit register, leaving the 4th
|
||||
// element undefined.
|
||||
EltVT = MVT::v4i8;
|
||||
NumElts = 1;
|
||||
} else if (NumElts > 1 && isPowerOf2_32(NumElts)) {
|
||||
// Handle default packed types.
|
||||
for (MVT PackedVT : NVPTX::packed_types()) {
|
||||
const auto NumEltsPerReg = PackedVT.getVectorNumElements();
|
||||
if (NumElts % NumEltsPerReg == 0 &&
|
||||
EltVT == PackedVT.getVectorElementType()) {
|
||||
EltVT = PackedVT;
|
||||
NumElts /= NumEltsPerReg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Since we actually can load/store b8, we need to ensure that we'll use
|
||||
// the original sized type for any i8s or i8 vectors.
|
||||
if (VT.getScalarType() == MVT::i8) {
|
||||
if (RegisterVT == MVT::i16)
|
||||
RegisterVT = MVT::i8;
|
||||
else if (RegisterVT == MVT::v2i16)
|
||||
RegisterVT = MVT::v2i8;
|
||||
else
|
||||
assert(RegisterVT == MVT::v4i8 &&
|
||||
"Expected v4i8, v2i16, or i16 for i8 RegisterVT");
|
||||
}
|
||||
|
||||
for (unsigned J : seq(NumElts)) {
|
||||
ValueVTs.push_back(EltVT);
|
||||
if (Offsets)
|
||||
Offsets->push_back(Off + J * EltVT.getStoreSize());
|
||||
}
|
||||
} else {
|
||||
ValueVTs.push_back(VT);
|
||||
if (Offsets)
|
||||
Offsets->push_back(Off);
|
||||
// TODO: This is horribly incorrect for cases where the vector elements are
|
||||
// not a multiple of bytes (ex i1) and legal or i8. However, this problem
|
||||
// has existed for as long as NVPTX has and no one has complained, so we'll
|
||||
// leave it for now.
|
||||
for (unsigned I : seq(NumRegs)) {
|
||||
ValueVTs.push_back(RegisterVT);
|
||||
Offsets.push_back(Off + I * RegisterVT.getStoreSize());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -631,7 +583,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
|
||||
addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
|
||||
addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
|
||||
addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
|
||||
addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
|
||||
|
||||
if (STI.hasF32x2Instructions())
|
||||
addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
|
||||
|
||||
// Conversion to/from FP16/FP16x2 is always legal.
|
||||
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
|
||||
@ -672,7 +626,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand);
|
||||
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand);
|
||||
// Need custom lowering in case the index is dynamic.
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
|
||||
if (STI.hasF32x2Instructions())
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
|
||||
|
||||
// Custom conversions to/from v2i8.
|
||||
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
|
||||
@ -1606,7 +1561,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
||||
} else {
|
||||
SmallVector<EVT, 16> VTs;
|
||||
SmallVector<uint64_t, 16> Offsets;
|
||||
ComputePTXValueVTs(*this, DL, Arg.Ty, VTs, &Offsets, VAOffset);
|
||||
ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
|
||||
VAOffset);
|
||||
assert(VTs.size() == Offsets.size() && "Size mismatch");
|
||||
assert(VTs.size() == ArgOuts.size() && "Size mismatch");
|
||||
|
||||
@ -1756,7 +1712,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
||||
if (!Ins.empty()) {
|
||||
SmallVector<EVT, 16> VTs;
|
||||
SmallVector<uint64_t, 16> Offsets;
|
||||
ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
|
||||
ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
|
||||
assert(VTs.size() == Ins.size() && "Bad value decomposition");
|
||||
|
||||
const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
|
||||
@ -3217,8 +3173,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
|
||||
if (ValVT != MemVT)
|
||||
return SDValue();
|
||||
|
||||
const auto NumEltsAndEltVT = getVectorLoweringShape(
|
||||
ValVT, STI.has256BitVectorLoadStore(N->getAddressSpace()));
|
||||
const auto NumEltsAndEltVT =
|
||||
getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
|
||||
if (!NumEltsAndEltVT)
|
||||
return SDValue();
|
||||
const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
|
||||
@ -3386,6 +3342,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
|
||||
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
|
||||
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
|
||||
const DataLayout &DL = DAG.getDataLayout();
|
||||
LLVMContext &Ctx = *DAG.getContext();
|
||||
auto PtrVT = getPointerTy(DAG.getDataLayout());
|
||||
|
||||
const Function &F = DAG.getMachineFunction().getFunction();
|
||||
@ -3457,7 +3414,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
|
||||
} else {
|
||||
SmallVector<EVT, 16> VTs;
|
||||
SmallVector<uint64_t, 16> Offsets;
|
||||
ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
|
||||
ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
|
||||
assert(VTs.size() == ArgIns.size() && "Size mismatch");
|
||||
assert(VTs.size() == Offsets.size() && "Size mismatch");
|
||||
|
||||
@ -3469,7 +3426,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
|
||||
for (const unsigned NumElts : VI) {
|
||||
// i1 is loaded/stored as i8
|
||||
const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
|
||||
const EVT VecVT = getVectorizedVT(LoadVT, NumElts, *DAG.getContext());
|
||||
const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
|
||||
|
||||
SDValue VecAddr = DAG.getObjectPtrOffset(
|
||||
dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
|
||||
@ -3514,6 +3471,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
||||
}
|
||||
|
||||
const DataLayout &DL = DAG.getDataLayout();
|
||||
LLVMContext &Ctx = *DAG.getContext();
|
||||
|
||||
const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
|
||||
const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
|
||||
@ -3526,7 +3484,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
||||
|
||||
SmallVector<EVT, 16> VTs;
|
||||
SmallVector<uint64_t, 16> Offsets;
|
||||
ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
|
||||
ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
|
||||
assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
|
||||
|
||||
const auto GetRetVal = [&](unsigned I) -> SDValue {
|
||||
@ -5985,8 +5943,8 @@ static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
||||
if (ResVT != MemVT)
|
||||
return;
|
||||
|
||||
const auto NumEltsAndEltVT = getVectorLoweringShape(
|
||||
ResVT, STI.has256BitVectorLoadStore(LD->getAddressSpace()));
|
||||
const auto NumEltsAndEltVT =
|
||||
getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
|
||||
if (!NumEltsAndEltVT)
|
||||
return;
|
||||
const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
|
||||
|
||||
@ -29,6 +29,12 @@ static cl::opt<bool>
|
||||
NoF16Math("nvptx-no-f16-math", cl::Hidden,
|
||||
cl::desc("NVPTX Specific: Disable generation of f16 math ops."),
|
||||
cl::init(false));
|
||||
|
||||
static cl::opt<bool> NoF32x2("nvptx-no-f32x2", cl::Hidden,
|
||||
cl::desc("NVPTX Specific: Disable generation of "
|
||||
"f32x2 instructions and registers."),
|
||||
cl::init(false));
|
||||
|
||||
// Pin the vtable to this file.
|
||||
void NVPTXSubtarget::anchor() {}
|
||||
|
||||
@ -70,6 +76,10 @@ bool NVPTXSubtarget::allowFP16Math() const {
|
||||
return hasFP16Math() && NoF16Math == false;
|
||||
}
|
||||
|
||||
bool NVPTXSubtarget::hasF32x2Instructions() const {
|
||||
return SmVersion >= 100 && PTXVersion >= 86 && !NoF32x2;
|
||||
}
|
||||
|
||||
bool NVPTXSubtarget::hasNativeBF16Support(int Opcode) const {
|
||||
if (!hasBF16Math())
|
||||
return false;
|
||||
|
||||
@ -117,9 +117,7 @@ public:
|
||||
return HasTcgen05 && PTXVersion >= 86;
|
||||
}
|
||||
// f32x2 instructions in Blackwell family
|
||||
bool hasF32x2Instructions() const {
|
||||
return SmVersion >= 100 && PTXVersion >= 86;
|
||||
}
|
||||
bool hasF32x2Instructions() const;
|
||||
|
||||
// TMA G2S copy with cta_group::1/2 support
|
||||
bool hasCpAsyncBulkTensorCTAGroupSupport() const {
|
||||
|
||||
@ -10,19 +10,20 @@ declare {float, float} @bars({float, float} %input)
|
||||
define void @test_v2f32(<2 x float> %input, ptr %output) {
|
||||
; CHECK-LABEL: test_v2f32(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0];
|
||||
; CHECK-NEXT: { // callseq 0, 0
|
||||
; CHECK-NEXT: .param .align 8 .b8 param0[8];
|
||||
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
|
||||
; CHECK-NEXT: st.param.b64 [param0], %rd1;
|
||||
; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
|
||||
; CHECK-NEXT: call.uni (retval0), barv, (param0);
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [retval0];
|
||||
; CHECK-NEXT: } // callseq 0
|
||||
; CHECK-NEXT: ld.param.b64 %rd3, [test_v2f32_param_1];
|
||||
; CHECK-NEXT: st.b64 [%rd3], %rd2;
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_1];
|
||||
; CHECK-NEXT: st.v2.b32 [%rd1], {%r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
%call = tail call <2 x float> @barv(<2 x float> %input)
|
||||
store <2 x float> %call, ptr %output, align 8
|
||||
@ -32,24 +33,28 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
|
||||
define void @test_v3f32(<3 x float> %input, ptr %output) {
|
||||
; CHECK-LABEL: test_v3f32(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8];
|
||||
; CHECK-NEXT: { // callseq 1, 0
|
||||
; CHECK-NEXT: .param .align 16 .b8 param0[16];
|
||||
; CHECK-NEXT: .param .align 16 .b8 retval0[16];
|
||||
; CHECK-NEXT: st.param.b32 [param0+8], %r1;
|
||||
; CHECK-NEXT: st.param.b64 [param0], %rd1;
|
||||
; CHECK-NEXT: st.param.b32 [param0+8], %r3;
|
||||
; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
|
||||
; CHECK-NEXT: call.uni (retval0), barv3, (param0);
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [retval0+8];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
|
||||
; CHECK-NEXT: ld.param.b32 %r4, [retval0+8];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0];
|
||||
; CHECK-NEXT: } // callseq 1
|
||||
; CHECK-NEXT: ld.param.b64 %rd3, [test_v3f32_param_1];
|
||||
; CHECK-NEXT: st.b32 [%rd3+8], %r2;
|
||||
; CHECK-NEXT: st.b64 [%rd3], %rd2;
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd1, %r5;
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd2, %r6;
|
||||
; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
|
||||
; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
|
||||
; CHECK-NEXT: ld.param.b64 %rd5, [test_v3f32_param_1];
|
||||
; CHECK-NEXT: st.b32 [%rd5+8], %r4;
|
||||
; CHECK-NEXT: st.b64 [%rd5], %rd4;
|
||||
; CHECK-NEXT: ret;
|
||||
%call = tail call <3 x float> @barv3(<3 x float> %input)
|
||||
; Make sure we don't load more values than we need to.
|
||||
|
||||
@ -688,25 +688,25 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
|
||||
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
|
||||
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r4;
|
||||
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
|
||||
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r2;
|
||||
; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
|
||||
; SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; SM70-NEXT: cvt.u32.u16 %r7, %rs1;
|
||||
; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
|
||||
; SM70-NEXT: shl.b32 %r8, %r7, 16;
|
||||
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
|
||||
; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
|
||||
; SM70-NEXT: shl.b32 %r10, %r9, 16;
|
||||
; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
|
||||
; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
|
||||
; SM70-NEXT: shl.b32 %r12, %r11, 16;
|
||||
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r4;
|
||||
; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
|
||||
; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
|
||||
; SM70-NEXT: shl.b32 %r14, %r13, 16;
|
||||
; SM70-NEXT: cvt.u32.u16 %r15, %rs5;
|
||||
; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
|
||||
; SM70-NEXT: shl.b32 %r16, %r15, 16;
|
||||
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r3;
|
||||
; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
|
||||
; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
|
||||
; SM70-NEXT: shl.b32 %r18, %r17, 16;
|
||||
; SM70-NEXT: cvt.u32.u16 %r19, %rs7;
|
||||
; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
|
||||
; SM70-NEXT: shl.b32 %r20, %r19, 16;
|
||||
; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r20, %r18, %r16, %r14};
|
||||
; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r10, %r8, %r6};
|
||||
@ -721,18 +721,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r5, %rs2;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r6, %rs1;
|
||||
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r7, %rs4;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r8, %rs3;
|
||||
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r4;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r9, %rs6;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r10, %rs5;
|
||||
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r3;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r11, %rs8;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r12, %rs7;
|
||||
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r3;
|
||||
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r4;
|
||||
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
|
||||
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r2;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r5, %rs8;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r6, %rs7;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r7, %rs6;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r8, %rs5;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r9, %rs4;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r10, %rs3;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r11, %rs2;
|
||||
; SM80-NEXT: cvt.f32.bf16 %r12, %rs1;
|
||||
; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
|
||||
; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
|
||||
; SM80-NEXT: ret;
|
||||
@ -746,18 +746,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs1;
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs4;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs3;
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r4;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs6;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs5;
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r3;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7;
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3;
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
|
||||
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
|
||||
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
|
||||
; SM80-FTZ-NEXT: ret;
|
||||
@ -771,18 +771,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r5, %rs2;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r6, %rs1;
|
||||
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r7, %rs4;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r8, %rs3;
|
||||
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r4;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r9, %rs6;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r10, %rs5;
|
||||
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r3;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r11, %rs8;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r12, %rs7;
|
||||
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r3;
|
||||
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r4;
|
||||
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r1;
|
||||
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r2;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r5, %rs8;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r6, %rs7;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r7, %rs6;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r8, %rs5;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r9, %rs4;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r10, %rs3;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r11, %rs2;
|
||||
; SM90-NEXT: cvt.f32.bf16 %r12, %rs1;
|
||||
; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
|
||||
; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
|
||||
; SM90-NEXT: ret;
|
||||
|
||||
@ -596,18 +596,15 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
|
||||
; CHECK-F16: {
|
||||
; CHECK-F16-NEXT: .reg .pred %p<3>;
|
||||
; CHECK-F16-NEXT: .reg .b32 %r<9>;
|
||||
; CHECK-F16-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-F16-EMPTY:
|
||||
; CHECK-F16-NEXT: // %bb.0:
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
|
||||
; CHECK-F16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2;
|
||||
; CHECK-F16-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-F16-NEXT: mov.b64 {%r5, %r6}, %rd1;
|
||||
; CHECK-F16-NEXT: selp.f32 %r7, %r6, %r4, %p2;
|
||||
; CHECK-F16-NEXT: selp.f32 %r8, %r5, %r3, %p1;
|
||||
; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2];
|
||||
; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r5, %r6;
|
||||
; CHECK-F16-NEXT: selp.f32 %r7, %r2, %r4, %p2;
|
||||
; CHECK-F16-NEXT: selp.f32 %r8, %r1, %r3, %p1;
|
||||
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
|
||||
; CHECK-F16-NEXT: ret;
|
||||
;
|
||||
@ -616,25 +613,22 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
|
||||
; CHECK-NOF16-NEXT: .reg .pred %p<3>;
|
||||
; CHECK-NOF16-NEXT: .reg .b16 %rs<5>;
|
||||
; CHECK-NOF16-NEXT: .reg .b32 %r<13>;
|
||||
; CHECK-NOF16-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-NOF16-EMPTY:
|
||||
; CHECK-NOF16-NEXT: // %bb.0:
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
|
||||
; CHECK-NOF16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs1;
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
|
||||
; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs4;
|
||||
; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5;
|
||||
; CHECK-NOF16-NEXT: mov.b64 {%r7, %r8}, %rd2;
|
||||
; CHECK-NOF16-NEXT: mov.b64 {%r9, %r10}, %rd1;
|
||||
; CHECK-NOF16-NEXT: selp.f32 %r11, %r10, %r8, %p2;
|
||||
; CHECK-NOF16-NEXT: selp.f32 %r12, %r9, %r7, %p1;
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2];
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r6;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1;
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r5;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
|
||||
; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r8, %r7;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs2;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs4;
|
||||
; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r10, %r9;
|
||||
; CHECK-NOF16-NEXT: selp.f32 %r11, %r2, %r4, %p2;
|
||||
; CHECK-NOF16-NEXT: selp.f32 %r12, %r1, %r3, %p1;
|
||||
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11};
|
||||
; CHECK-NOF16-NEXT: ret;
|
||||
<2 x half> %c, <2 x half> %d) #0 {
|
||||
@ -649,17 +643,14 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
|
||||
; CHECK-NEXT: .reg .pred %p<3>;
|
||||
; CHECK-NEXT: .reg .b16 %rs<7>;
|
||||
; CHECK-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f16_f32_param_3];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f16_f32_param_2];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
|
||||
; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
|
||||
; CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3;
|
||||
; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4;
|
||||
; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5;
|
||||
; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6;
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2;
|
||||
@ -1501,11 +1492,9 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b16 %rs<3>;
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0];
|
||||
; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2;
|
||||
; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1;
|
||||
; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1};
|
||||
@ -1928,12 +1917,10 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
|
||||
; CHECK-F16: {
|
||||
; CHECK-F16-NEXT: .reg .b16 %rs<3>;
|
||||
; CHECK-F16-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-F16-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-F16-EMPTY:
|
||||
; CHECK-F16-NEXT: // %bb.0:
|
||||
; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
|
||||
; CHECK-F16-NEXT: mov.b64 {%r2, %r3}, %rd1;
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3;
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2;
|
||||
; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1};
|
||||
@ -1947,21 +1934,19 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
|
||||
; CHECK-NOF16: {
|
||||
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
|
||||
; CHECK-NOF16-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-NOF16-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-NOF16-EMPTY:
|
||||
; CHECK-NOF16-NEXT: // %bb.0:
|
||||
; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
|
||||
; CHECK-NOF16-NEXT: mov.b64 {%r2, %r3}, %rd1;
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767;
|
||||
; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648;
|
||||
; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; }
|
||||
; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4;
|
||||
; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767;
|
||||
; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; }
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1;
|
||||
; CHECK-NOF16-NEXT: and.b16 %rs4, %rs3, 32767;
|
||||
; CHECK-NOF16-NEXT: or.b16 %rs5, %rs4, %rs1;
|
||||
; CHECK-NOF16-NEXT: and.b32 %r5, %r2, -2147483648;
|
||||
; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r5; }
|
||||
; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7;
|
||||
; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r5; }
|
||||
; CHECK-NOF16-NEXT: and.b16 %rs7, %rs2, 32767;
|
||||
; CHECK-NOF16-NEXT: or.b16 %rs8, %rs7, %rs6;
|
||||
; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs8, %rs5};
|
||||
; CHECK-NOF16-NEXT: ret;
|
||||
%tb = fptrunc <2 x float> %b to <2 x half>
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -7,16 +7,17 @@ declare <4 x float> @bar()
|
||||
define void @foo(ptr %ptr) {
|
||||
; CHECK-LABEL: foo(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0];
|
||||
; CHECK-NEXT: { // callseq 0, 0
|
||||
; CHECK-NEXT: .param .align 16 .b8 retval0[16];
|
||||
; CHECK-NEXT: call.uni (retval0), bar, ();
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [retval0];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0];
|
||||
; CHECK-NEXT: } // callseq 0
|
||||
; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3};
|
||||
; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
%val = tail call <4 x float> @bar()
|
||||
store <4 x float> %val, ptr %ptr
|
||||
|
||||
@ -137,18 +137,32 @@ define void @generic_4xi64(ptr %a, ptr %b) {
|
||||
}
|
||||
|
||||
define void @generic_8xfloat(ptr %a, ptr %b) {
|
||||
; CHECK-LABEL: generic_8xfloat(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
|
||||
; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; CHECK-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1];
|
||||
; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ret;
|
||||
; SM90-LABEL: generic_8xfloat(
|
||||
; SM90: {
|
||||
; SM90-NEXT: .reg .b32 %r<9>;
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
|
||||
; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1];
|
||||
; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
|
||||
; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; SM90-NEXT: ret;
|
||||
;
|
||||
; SM100-LABEL: generic_8xfloat(
|
||||
; SM100: {
|
||||
; SM100-NEXT: .reg .b64 %rd<7>;
|
||||
; SM100-EMPTY:
|
||||
; SM100-NEXT: // %bb.0:
|
||||
; SM100-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
|
||||
; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; SM100-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1];
|
||||
; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; SM100-NEXT: ret;
|
||||
%a.load = load <8 x float>, ptr %a
|
||||
store <8 x float> %a.load, ptr %b
|
||||
ret void
|
||||
@ -288,18 +302,32 @@ define void @generic_volatile_4xi64(ptr %a, ptr %b) {
|
||||
}
|
||||
|
||||
define void @generic_volatile_8xfloat(ptr %a, ptr %b) {
|
||||
; CHECK-LABEL: generic_volatile_8xfloat(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
|
||||
; CHECK-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1];
|
||||
; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ret;
|
||||
; SM90-LABEL: generic_volatile_8xfloat(
|
||||
; SM90: {
|
||||
; SM90-NEXT: .reg .b32 %r<9>;
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
|
||||
; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1];
|
||||
; SM90-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
|
||||
; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; SM90-NEXT: ret;
|
||||
;
|
||||
; SM100-LABEL: generic_volatile_8xfloat(
|
||||
; SM100: {
|
||||
; SM100-NEXT: .reg .b64 %rd<7>;
|
||||
; SM100-EMPTY:
|
||||
; SM100-NEXT: // %bb.0:
|
||||
; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
|
||||
; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1];
|
||||
; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; SM100-NEXT: ret;
|
||||
%a.load = load volatile <8 x float>, ptr %a
|
||||
store volatile <8 x float> %a.load, ptr %b
|
||||
ret void
|
||||
@ -514,15 +542,16 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
|
||||
define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
|
||||
; SM90-LABEL: global_8xfloat(
|
||||
; SM90: {
|
||||
; SM90-NEXT: .reg .b64 %rd<7>;
|
||||
; SM90-NEXT: .reg .b32 %r<9>;
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0];
|
||||
; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1];
|
||||
; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1];
|
||||
; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
|
||||
; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; SM90-NEXT: ret;
|
||||
;
|
||||
; SM100-LABEL: global_8xfloat(
|
||||
@ -758,15 +787,16 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
|
||||
define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
|
||||
; SM90-LABEL: global_volatile_8xfloat(
|
||||
; SM90: {
|
||||
; SM90-NEXT: .reg .b64 %rd<7>;
|
||||
; SM90-NEXT: .reg .b32 %r<9>;
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0];
|
||||
; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1];
|
||||
; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1];
|
||||
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
|
||||
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; SM90-NEXT: ret;
|
||||
;
|
||||
; SM100-LABEL: global_volatile_8xfloat(
|
||||
@ -931,18 +961,32 @@ define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
|
||||
}
|
||||
|
||||
define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
|
||||
; CHECK-LABEL: shared_8xfloat(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0];
|
||||
; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; CHECK-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1];
|
||||
; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ret;
|
||||
; SM90-LABEL: shared_8xfloat(
|
||||
; SM90: {
|
||||
; SM90-NEXT: .reg .b32 %r<9>;
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0];
|
||||
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [shared_8xfloat_param_1];
|
||||
; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
|
||||
; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; SM90-NEXT: ret;
|
||||
;
|
||||
; SM100-LABEL: shared_8xfloat(
|
||||
; SM100: {
|
||||
; SM100-NEXT: .reg .b64 %rd<7>;
|
||||
; SM100-EMPTY:
|
||||
; SM100-NEXT: // %bb.0:
|
||||
; SM100-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0];
|
||||
; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; SM100-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1];
|
||||
; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; SM100-NEXT: ret;
|
||||
%a.load = load <8 x float>, ptr addrspace(3) %a
|
||||
store <8 x float> %a.load, ptr addrspace(3) %b
|
||||
ret void
|
||||
@ -1082,18 +1126,32 @@ define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
|
||||
}
|
||||
|
||||
define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
|
||||
; CHECK-LABEL: shared_volatile_8xfloat(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
|
||||
; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1];
|
||||
; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ret;
|
||||
; SM90-LABEL: shared_volatile_8xfloat(
|
||||
; SM90: {
|
||||
; SM90-NEXT: .reg .b32 %r<9>;
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
|
||||
; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1];
|
||||
; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
|
||||
; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; SM90-NEXT: ret;
|
||||
;
|
||||
; SM100-LABEL: shared_volatile_8xfloat(
|
||||
; SM100: {
|
||||
; SM100-NEXT: .reg .b64 %rd<7>;
|
||||
; SM100-EMPTY:
|
||||
; SM100-NEXT: // %bb.0:
|
||||
; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
|
||||
; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1];
|
||||
; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; SM100-NEXT: ret;
|
||||
%a.load = load volatile <8 x float>, ptr addrspace(3) %a
|
||||
store volatile <8 x float> %a.load, ptr addrspace(3) %b
|
||||
ret void
|
||||
@ -1235,18 +1293,32 @@ define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
|
||||
}
|
||||
|
||||
define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
|
||||
; CHECK-LABEL: local_8xfloat(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
|
||||
; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; CHECK-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1];
|
||||
; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ret;
|
||||
; SM90-LABEL: local_8xfloat(
|
||||
; SM90: {
|
||||
; SM90-NEXT: .reg .b32 %r<9>;
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
|
||||
; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [local_8xfloat_param_1];
|
||||
; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
|
||||
; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; SM90-NEXT: ret;
|
||||
;
|
||||
; SM100-LABEL: local_8xfloat(
|
||||
; SM100: {
|
||||
; SM100-NEXT: .reg .b64 %rd<7>;
|
||||
; SM100-EMPTY:
|
||||
; SM100-NEXT: // %bb.0:
|
||||
; SM100-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
|
||||
; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; SM100-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1];
|
||||
; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; SM100-NEXT: ret;
|
||||
%a.load = load <8 x float>, ptr addrspace(5) %a
|
||||
store <8 x float> %a.load, ptr addrspace(5) %b
|
||||
ret void
|
||||
@ -1386,18 +1458,32 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
|
||||
}
|
||||
|
||||
define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
|
||||
; CHECK-LABEL: local_volatile_8xfloat(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
|
||||
; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
|
||||
; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ret;
|
||||
; SM90-LABEL: local_volatile_8xfloat(
|
||||
; SM90: {
|
||||
; SM90-NEXT: .reg .b32 %r<9>;
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
|
||||
; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xfloat_param_1];
|
||||
; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
|
||||
; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; SM90-NEXT: ret;
|
||||
;
|
||||
; SM100-LABEL: local_volatile_8xfloat(
|
||||
; SM100: {
|
||||
; SM100-NEXT: .reg .b64 %rd<7>;
|
||||
; SM100-EMPTY:
|
||||
; SM100-NEXT: // %bb.0:
|
||||
; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
|
||||
; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
|
||||
; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
|
||||
; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
|
||||
; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
|
||||
; SM100-NEXT: ret;
|
||||
%a.load = load volatile <8 x float>, ptr addrspace(5) %a
|
||||
store volatile <8 x float> %a.load, ptr addrspace(5) %b
|
||||
ret void
|
||||
|
||||
@ -333,28 +333,30 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
|
||||
define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
|
||||
; SM20-LABEL: foo11(
|
||||
; SM20: {
|
||||
; SM20-NEXT: .reg .b64 %rd<6>;
|
||||
; SM20-NEXT: .reg .b32 %r<3>;
|
||||
; SM20-NEXT: .reg .b64 %rd<5>;
|
||||
; SM20-EMPTY:
|
||||
; SM20-NEXT: // %bb.0:
|
||||
; SM20-NEXT: ld.param.b64 %rd1, [foo11_param_0];
|
||||
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
|
||||
; SM20-NEXT: ld.param.b64 %rd3, [foo11_param_1];
|
||||
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
|
||||
; SM20-NEXT: ld.global.b64 %rd5, [%rd2];
|
||||
; SM20-NEXT: st.global.b64 [%rd4], %rd5;
|
||||
; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2];
|
||||
; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
|
||||
; SM20-NEXT: ret;
|
||||
;
|
||||
; SM35-LABEL: foo11(
|
||||
; SM35: {
|
||||
; SM35-NEXT: .reg .b64 %rd<6>;
|
||||
; SM35-NEXT: .reg .b32 %r<3>;
|
||||
; SM35-NEXT: .reg .b64 %rd<5>;
|
||||
; SM35-EMPTY:
|
||||
; SM35-NEXT: // %bb.0:
|
||||
; SM35-NEXT: ld.param.b64 %rd1, [foo11_param_0];
|
||||
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
|
||||
; SM35-NEXT: ld.param.b64 %rd3, [foo11_param_1];
|
||||
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
|
||||
; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2];
|
||||
; SM35-NEXT: st.global.b64 [%rd4], %rd5;
|
||||
; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2];
|
||||
; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
|
||||
; SM35-NEXT: ret;
|
||||
%1 = load <2 x float>, ptr %from
|
||||
store <2 x float> %1, ptr %to
|
||||
@ -494,28 +496,30 @@ define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
|
||||
define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
|
||||
; SM20-LABEL: foo16(
|
||||
; SM20: {
|
||||
; SM20-NEXT: .reg .b64 %rd<7>;
|
||||
; SM20-NEXT: .reg .b32 %r<5>;
|
||||
; SM20-NEXT: .reg .b64 %rd<5>;
|
||||
; SM20-EMPTY:
|
||||
; SM20-NEXT: // %bb.0:
|
||||
; SM20-NEXT: ld.param.b64 %rd1, [foo16_param_0];
|
||||
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
|
||||
; SM20-NEXT: ld.param.b64 %rd3, [foo16_param_1];
|
||||
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
|
||||
; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
|
||||
; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
|
||||
; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
|
||||
; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
|
||||
; SM20-NEXT: ret;
|
||||
;
|
||||
; SM35-LABEL: foo16(
|
||||
; SM35: {
|
||||
; SM35-NEXT: .reg .b64 %rd<7>;
|
||||
; SM35-NEXT: .reg .b32 %r<5>;
|
||||
; SM35-NEXT: .reg .b64 %rd<5>;
|
||||
; SM35-EMPTY:
|
||||
; SM35-NEXT: // %bb.0:
|
||||
; SM35-NEXT: ld.param.b64 %rd1, [foo16_param_0];
|
||||
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
|
||||
; SM35-NEXT: ld.param.b64 %rd3, [foo16_param_1];
|
||||
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
|
||||
; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
|
||||
; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
|
||||
; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
|
||||
; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
|
||||
; SM35-NEXT: ret;
|
||||
%1 = load <4 x float>, ptr %from
|
||||
store <4 x float> %1, ptr %to
|
||||
|
||||
@ -8,55 +8,52 @@ target triple = "nvptx64-nvidia-cuda"
|
||||
define <4 x float> @t1(ptr %p1) {
|
||||
; CHECK-LABEL: t1(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<46>;
|
||||
; CHECK-NEXT: .reg .b32 %r<41>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [t1_param_0];
|
||||
; CHECK-NEXT: ld.b8 %rd2, [%rd1+8];
|
||||
; CHECK-NEXT: ld.b8 %rd3, [%rd1+9];
|
||||
; CHECK-NEXT: shl.b64 %rd4, %rd3, 8;
|
||||
; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2;
|
||||
; CHECK-NEXT: ld.b8 %rd6, [%rd1+10];
|
||||
; CHECK-NEXT: shl.b64 %rd7, %rd6, 16;
|
||||
; CHECK-NEXT: ld.b8 %rd8, [%rd1+11];
|
||||
; CHECK-NEXT: shl.b64 %rd9, %rd8, 24;
|
||||
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd7;
|
||||
; CHECK-NEXT: or.b64 %rd11, %rd10, %rd5;
|
||||
; CHECK-NEXT: ld.b8 %rd12, [%rd1+12];
|
||||
; CHECK-NEXT: ld.b8 %rd13, [%rd1+13];
|
||||
; CHECK-NEXT: shl.b64 %rd14, %rd13, 8;
|
||||
; CHECK-NEXT: or.b64 %rd15, %rd14, %rd12;
|
||||
; CHECK-NEXT: ld.b8 %rd16, [%rd1+14];
|
||||
; CHECK-NEXT: shl.b64 %rd17, %rd16, 16;
|
||||
; CHECK-NEXT: ld.b8 %rd18, [%rd1+15];
|
||||
; CHECK-NEXT: shl.b64 %rd19, %rd18, 24;
|
||||
; CHECK-NEXT: or.b64 %rd20, %rd19, %rd17;
|
||||
; CHECK-NEXT: or.b64 %rd21, %rd20, %rd15;
|
||||
; CHECK-NEXT: shl.b64 %rd22, %rd21, 32;
|
||||
; CHECK-NEXT: or.b64 %rd23, %rd22, %rd11;
|
||||
; CHECK-NEXT: ld.b8 %rd24, [%rd1];
|
||||
; CHECK-NEXT: ld.b8 %rd25, [%rd1+1];
|
||||
; CHECK-NEXT: shl.b64 %rd26, %rd25, 8;
|
||||
; CHECK-NEXT: or.b64 %rd27, %rd26, %rd24;
|
||||
; CHECK-NEXT: ld.b8 %rd28, [%rd1+2];
|
||||
; CHECK-NEXT: shl.b64 %rd29, %rd28, 16;
|
||||
; CHECK-NEXT: ld.b8 %rd30, [%rd1+3];
|
||||
; CHECK-NEXT: shl.b64 %rd31, %rd30, 24;
|
||||
; CHECK-NEXT: or.b64 %rd32, %rd31, %rd29;
|
||||
; CHECK-NEXT: or.b64 %rd33, %rd32, %rd27;
|
||||
; CHECK-NEXT: ld.b8 %rd34, [%rd1+4];
|
||||
; CHECK-NEXT: ld.b8 %rd35, [%rd1+5];
|
||||
; CHECK-NEXT: shl.b64 %rd36, %rd35, 8;
|
||||
; CHECK-NEXT: or.b64 %rd37, %rd36, %rd34;
|
||||
; CHECK-NEXT: ld.b8 %rd38, [%rd1+6];
|
||||
; CHECK-NEXT: shl.b64 %rd39, %rd38, 16;
|
||||
; CHECK-NEXT: ld.b8 %rd40, [%rd1+7];
|
||||
; CHECK-NEXT: shl.b64 %rd41, %rd40, 24;
|
||||
; CHECK-NEXT: or.b64 %rd42, %rd41, %rd39;
|
||||
; CHECK-NEXT: or.b64 %rd43, %rd42, %rd37;
|
||||
; CHECK-NEXT: shl.b64 %rd44, %rd43, 32;
|
||||
; CHECK-NEXT: or.b64 %rd45, %rd44, %rd33;
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd45, %rd23};
|
||||
; CHECK-NEXT: ld.b8 %r1, [%rd1+12];
|
||||
; CHECK-NEXT: ld.b8 %r2, [%rd1+13];
|
||||
; CHECK-NEXT: shl.b32 %r3, %r2, 8;
|
||||
; CHECK-NEXT: or.b32 %r4, %r3, %r1;
|
||||
; CHECK-NEXT: ld.b8 %r5, [%rd1+14];
|
||||
; CHECK-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-NEXT: ld.b8 %r7, [%rd1+15];
|
||||
; CHECK-NEXT: shl.b32 %r8, %r7, 24;
|
||||
; CHECK-NEXT: or.b32 %r9, %r8, %r6;
|
||||
; CHECK-NEXT: or.b32 %r10, %r9, %r4;
|
||||
; CHECK-NEXT: ld.b8 %r11, [%rd1+8];
|
||||
; CHECK-NEXT: ld.b8 %r12, [%rd1+9];
|
||||
; CHECK-NEXT: shl.b32 %r13, %r12, 8;
|
||||
; CHECK-NEXT: or.b32 %r14, %r13, %r11;
|
||||
; CHECK-NEXT: ld.b8 %r15, [%rd1+10];
|
||||
; CHECK-NEXT: shl.b32 %r16, %r15, 16;
|
||||
; CHECK-NEXT: ld.b8 %r17, [%rd1+11];
|
||||
; CHECK-NEXT: shl.b32 %r18, %r17, 24;
|
||||
; CHECK-NEXT: or.b32 %r19, %r18, %r16;
|
||||
; CHECK-NEXT: or.b32 %r20, %r19, %r14;
|
||||
; CHECK-NEXT: ld.b8 %r21, [%rd1+4];
|
||||
; CHECK-NEXT: ld.b8 %r22, [%rd1+5];
|
||||
; CHECK-NEXT: shl.b32 %r23, %r22, 8;
|
||||
; CHECK-NEXT: or.b32 %r24, %r23, %r21;
|
||||
; CHECK-NEXT: ld.b8 %r25, [%rd1+6];
|
||||
; CHECK-NEXT: shl.b32 %r26, %r25, 16;
|
||||
; CHECK-NEXT: ld.b8 %r27, [%rd1+7];
|
||||
; CHECK-NEXT: shl.b32 %r28, %r27, 24;
|
||||
; CHECK-NEXT: or.b32 %r29, %r28, %r26;
|
||||
; CHECK-NEXT: or.b32 %r30, %r29, %r24;
|
||||
; CHECK-NEXT: ld.b8 %r31, [%rd1];
|
||||
; CHECK-NEXT: ld.b8 %r32, [%rd1+1];
|
||||
; CHECK-NEXT: shl.b32 %r33, %r32, 8;
|
||||
; CHECK-NEXT: or.b32 %r34, %r33, %r31;
|
||||
; CHECK-NEXT: ld.b8 %r35, [%rd1+2];
|
||||
; CHECK-NEXT: shl.b32 %r36, %r35, 16;
|
||||
; CHECK-NEXT: ld.b8 %r37, [%rd1+3];
|
||||
; CHECK-NEXT: shl.b32 %r38, %r37, 24;
|
||||
; CHECK-NEXT: or.b32 %r39, %r38, %r36;
|
||||
; CHECK-NEXT: or.b32 %r40, %r39, %r34;
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r30, %r20, %r10};
|
||||
; CHECK-NEXT: ret;
|
||||
%r = load <4 x float>, ptr %p1, align 1
|
||||
ret <4 x float> %r
|
||||
@ -65,19 +62,16 @@ define <4 x float> @t1(ptr %p1) {
|
||||
define <4 x float> @t2(ptr %p1) {
|
||||
; CHECK-LABEL: t2(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<10>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0];
|
||||
; CHECK-NEXT: ld.b32 %rd2, [%rd1+8];
|
||||
; CHECK-NEXT: ld.b32 %rd3, [%rd1+12];
|
||||
; CHECK-NEXT: shl.b64 %rd4, %rd3, 32;
|
||||
; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2;
|
||||
; CHECK-NEXT: ld.b32 %rd6, [%rd1];
|
||||
; CHECK-NEXT: ld.b32 %rd7, [%rd1+4];
|
||||
; CHECK-NEXT: shl.b64 %rd8, %rd7, 32;
|
||||
; CHECK-NEXT: or.b64 %rd9, %rd8, %rd6;
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd9, %rd5};
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1+12];
|
||||
; CHECK-NEXT: ld.b32 %r2, [%rd1+8];
|
||||
; CHECK-NEXT: ld.b32 %r3, [%rd1+4];
|
||||
; CHECK-NEXT: ld.b32 %r4, [%rd1];
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
|
||||
; CHECK-NEXT: ret;
|
||||
%r = load <4 x float>, ptr %p1, align 4
|
||||
ret <4 x float> %r
|
||||
@ -86,13 +80,14 @@ define <4 x float> @t2(ptr %p1) {
|
||||
define <4 x float> @t3(ptr %p1) {
|
||||
; CHECK-LABEL: t3(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0];
|
||||
; CHECK-NEXT: ld.b64 %rd2, [%rd1+8];
|
||||
; CHECK-NEXT: ld.b64 %rd3, [%rd1];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
|
||||
; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1+8];
|
||||
; CHECK-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1];
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r3, %r4, %r1, %r2};
|
||||
; CHECK-NEXT: ret;
|
||||
%r = load <4 x float>, ptr %p1, align 8
|
||||
ret <4 x float> %r
|
||||
@ -101,12 +96,13 @@ define <4 x float> @t3(ptr %p1) {
|
||||
define <4 x float> @t4(ptr %p1) {
|
||||
; CHECK-LABEL: t4(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [t4_param_0];
|
||||
; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
%r = load <4 x float>, ptr %p1, align 16
|
||||
ret <4 x float> %r
|
||||
@ -189,40 +185,43 @@ define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) {
|
||||
define void @s1(ptr %p1, <4 x float> %v) {
|
||||
; CHECK-LABEL: s1(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<18>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s1_param_1];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s1_param_1];
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd2, %r4;
|
||||
; CHECK-NEXT: st.b8 [%rd1+12], %rd2;
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd3, %r3;
|
||||
; CHECK-NEXT: st.b8 [%rd1+8], %rd3;
|
||||
; CHECK-NEXT: st.b8 [%rd1], %rd2;
|
||||
; CHECK-NEXT: shr.u64 %rd4, %rd3, 56;
|
||||
; CHECK-NEXT: st.b8 [%rd1+15], %rd4;
|
||||
; CHECK-NEXT: shr.u64 %rd5, %rd3, 48;
|
||||
; CHECK-NEXT: st.b8 [%rd1+14], %rd5;
|
||||
; CHECK-NEXT: shr.u64 %rd6, %rd3, 40;
|
||||
; CHECK-NEXT: st.b8 [%rd1+13], %rd6;
|
||||
; CHECK-NEXT: shr.u64 %rd7, %rd3, 32;
|
||||
; CHECK-NEXT: st.b8 [%rd1+12], %rd7;
|
||||
; CHECK-NEXT: shr.u64 %rd8, %rd3, 24;
|
||||
; CHECK-NEXT: st.b8 [%rd1+11], %rd8;
|
||||
; CHECK-NEXT: shr.u64 %rd9, %rd3, 16;
|
||||
; CHECK-NEXT: st.b8 [%rd1+10], %rd9;
|
||||
; CHECK-NEXT: shr.u64 %rd10, %rd3, 8;
|
||||
; CHECK-NEXT: st.b8 [%rd1+9], %rd10;
|
||||
; CHECK-NEXT: shr.u64 %rd11, %rd2, 56;
|
||||
; CHECK-NEXT: st.b8 [%rd1+7], %rd11;
|
||||
; CHECK-NEXT: shr.u64 %rd12, %rd2, 48;
|
||||
; CHECK-NEXT: st.b8 [%rd1+6], %rd12;
|
||||
; CHECK-NEXT: shr.u64 %rd13, %rd2, 40;
|
||||
; CHECK-NEXT: st.b8 [%rd1+5], %rd13;
|
||||
; CHECK-NEXT: shr.u64 %rd14, %rd2, 32;
|
||||
; CHECK-NEXT: st.b8 [%rd1+4], %rd14;
|
||||
; CHECK-NEXT: shr.u64 %rd15, %rd2, 24;
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd4, %r2;
|
||||
; CHECK-NEXT: st.b8 [%rd1+4], %rd4;
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd5, %r1;
|
||||
; CHECK-NEXT: st.b8 [%rd1], %rd5;
|
||||
; CHECK-NEXT: shr.u64 %rd6, %rd2, 24;
|
||||
; CHECK-NEXT: st.b8 [%rd1+15], %rd6;
|
||||
; CHECK-NEXT: shr.u64 %rd7, %rd2, 16;
|
||||
; CHECK-NEXT: st.b8 [%rd1+14], %rd7;
|
||||
; CHECK-NEXT: shr.u64 %rd8, %rd2, 8;
|
||||
; CHECK-NEXT: st.b8 [%rd1+13], %rd8;
|
||||
; CHECK-NEXT: shr.u64 %rd9, %rd3, 24;
|
||||
; CHECK-NEXT: st.b8 [%rd1+11], %rd9;
|
||||
; CHECK-NEXT: shr.u64 %rd10, %rd3, 16;
|
||||
; CHECK-NEXT: st.b8 [%rd1+10], %rd10;
|
||||
; CHECK-NEXT: shr.u64 %rd11, %rd3, 8;
|
||||
; CHECK-NEXT: st.b8 [%rd1+9], %rd11;
|
||||
; CHECK-NEXT: shr.u64 %rd12, %rd4, 24;
|
||||
; CHECK-NEXT: st.b8 [%rd1+7], %rd12;
|
||||
; CHECK-NEXT: shr.u64 %rd13, %rd4, 16;
|
||||
; CHECK-NEXT: st.b8 [%rd1+6], %rd13;
|
||||
; CHECK-NEXT: shr.u64 %rd14, %rd4, 8;
|
||||
; CHECK-NEXT: st.b8 [%rd1+5], %rd14;
|
||||
; CHECK-NEXT: shr.u64 %rd15, %rd5, 24;
|
||||
; CHECK-NEXT: st.b8 [%rd1+3], %rd15;
|
||||
; CHECK-NEXT: shr.u64 %rd16, %rd2, 16;
|
||||
; CHECK-NEXT: shr.u64 %rd16, %rd5, 16;
|
||||
; CHECK-NEXT: st.b8 [%rd1+2], %rd16;
|
||||
; CHECK-NEXT: shr.u64 %rd17, %rd2, 8;
|
||||
; CHECK-NEXT: shr.u64 %rd17, %rd5, 8;
|
||||
; CHECK-NEXT: st.b8 [%rd1+1], %rd17;
|
||||
; CHECK-NEXT: ret;
|
||||
store <4 x float> %v, ptr %p1, align 1
|
||||
@ -232,17 +231,16 @@ define void @s1(ptr %p1, <4 x float> %v) {
|
||||
define void @s2(ptr %p1, <4 x float> %v) {
|
||||
; CHECK-LABEL: s2(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<6>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s2_param_1];
|
||||
; CHECK-NEXT: st.b32 [%rd1+8], %rd3;
|
||||
; CHECK-NEXT: st.b32 [%rd1], %rd2;
|
||||
; CHECK-NEXT: shr.u64 %rd4, %rd3, 32;
|
||||
; CHECK-NEXT: st.b32 [%rd1+12], %rd4;
|
||||
; CHECK-NEXT: shr.u64 %rd5, %rd2, 32;
|
||||
; CHECK-NEXT: st.b32 [%rd1+4], %rd5;
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s2_param_1];
|
||||
; CHECK-NEXT: st.b32 [%rd1+12], %r4;
|
||||
; CHECK-NEXT: st.b32 [%rd1+8], %r3;
|
||||
; CHECK-NEXT: st.b32 [%rd1+4], %r2;
|
||||
; CHECK-NEXT: st.b32 [%rd1], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
store <4 x float> %v, ptr %p1, align 4
|
||||
ret void
|
||||
@ -251,13 +249,14 @@ define void @s2(ptr %p1, <4 x float> %v) {
|
||||
define void @s3(ptr %p1, <4 x float> %v) {
|
||||
; CHECK-LABEL: s3(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [s3_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s3_param_1];
|
||||
; CHECK-NEXT: st.b64 [%rd1+8], %rd3;
|
||||
; CHECK-NEXT: st.b64 [%rd1], %rd2;
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s3_param_1];
|
||||
; CHECK-NEXT: st.v2.b32 [%rd1+8], {%r3, %r4};
|
||||
; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2};
|
||||
; CHECK-NEXT: ret;
|
||||
store <4 x float> %v, ptr %p1, align 8
|
||||
ret void
|
||||
@ -266,12 +265,13 @@ define void @s3(ptr %p1, <4 x float> %v) {
|
||||
define void @s4(ptr %p1, <4 x float> %v) {
|
||||
; CHECK-LABEL: s4(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [s4_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s4_param_1];
|
||||
; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s4_param_1];
|
||||
; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
store <4 x float> %v, ptr %p1, align 16
|
||||
ret void
|
||||
|
||||
@ -203,27 +203,35 @@ define i64 @mulwideu32(i32 %a, i32 %b) {
|
||||
define i64 @mulwideu7(i7 %a, i7 %b) {
|
||||
; OPT-LABEL: mulwideu7(
|
||||
; OPT: {
|
||||
; OPT-NEXT: .reg .b32 %r<3>;
|
||||
; OPT-NEXT: .reg .b32 %r<5>;
|
||||
; OPT-NEXT: .reg .b64 %rd<2>;
|
||||
; OPT-EMPTY:
|
||||
; OPT-NEXT: // %bb.0:
|
||||
; OPT-NEXT: ld.param.b8 %r1, [mulwideu7_param_0];
|
||||
; OPT-NEXT: ld.param.b8 %r2, [mulwideu7_param_1];
|
||||
; OPT-NEXT: mul.wide.u32 %rd1, %r1, %r2;
|
||||
; OPT-NEXT: ld.param.b8 %r1, [mulwideu7_param_1];
|
||||
; OPT-NEXT: and.b32 %r2, %r1, 127;
|
||||
; OPT-NEXT: ld.param.b8 %r3, [mulwideu7_param_0];
|
||||
; OPT-NEXT: and.b32 %r4, %r3, 127;
|
||||
; OPT-NEXT: mul.wide.u32 %rd1, %r4, %r2;
|
||||
; OPT-NEXT: st.param.b64 [func_retval0], %rd1;
|
||||
; OPT-NEXT: ret;
|
||||
;
|
||||
; NOOPT-LABEL: mulwideu7(
|
||||
; NOOPT: {
|
||||
; NOOPT-NEXT: .reg .b16 %rs<3>;
|
||||
; NOOPT-NEXT: .reg .b16 %rs<9>;
|
||||
; NOOPT-NEXT: .reg .b64 %rd<6>;
|
||||
; NOOPT-EMPTY:
|
||||
; NOOPT-NEXT: // %bb.0:
|
||||
; NOOPT-NEXT: ld.param.b8 %rs2, [mulwideu7_param_1];
|
||||
; NOOPT-NEXT: ld.param.b8 %rs1, [mulwideu7_param_0];
|
||||
; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs1;
|
||||
; NOOPT-NEXT: ld.param.b8 %rs3, [mulwideu7_param_0+1];
|
||||
; NOOPT-NEXT: shl.b16 %rs4, %rs3, 8;
|
||||
; NOOPT-NEXT: ld.param.b8 %rs5, [mulwideu7_param_0];
|
||||
; NOOPT-NEXT: or.b16 %rs1, %rs4, %rs5;
|
||||
; NOOPT-NEXT: ld.param.b8 %rs6, [mulwideu7_param_1+1];
|
||||
; NOOPT-NEXT: shl.b16 %rs7, %rs6, 8;
|
||||
; NOOPT-NEXT: ld.param.b8 %rs8, [mulwideu7_param_1];
|
||||
; NOOPT-NEXT: or.b16 %rs2, %rs7, %rs8;
|
||||
; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs5;
|
||||
; NOOPT-NEXT: and.b64 %rd2, %rd1, 127;
|
||||
; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs2;
|
||||
; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs8;
|
||||
; NOOPT-NEXT: and.b64 %rd4, %rd3, 127;
|
||||
; NOOPT-NEXT: mul.lo.s64 %rd5, %rd2, %rd4;
|
||||
; NOOPT-NEXT: st.param.b64 [func_retval0], %rd5;
|
||||
@ -242,26 +250,32 @@ define i64 @mulwides7(i7 %a, i7 %b) {
|
||||
; OPT-EMPTY:
|
||||
; OPT-NEXT: // %bb.0:
|
||||
; OPT-NEXT: ld.param.b8 %r1, [mulwides7_param_0];
|
||||
; OPT-NEXT: bfe.s32 %r2, %r1, 0, 7;
|
||||
; OPT-NEXT: ld.param.b8 %r3, [mulwides7_param_1];
|
||||
; OPT-NEXT: bfe.s32 %r4, %r3, 0, 7;
|
||||
; OPT-NEXT: mul.wide.s32 %rd1, %r2, %r4;
|
||||
; OPT-NEXT: ld.param.b8 %r2, [mulwides7_param_1];
|
||||
; OPT-NEXT: bfe.s32 %r3, %r2, 0, 7;
|
||||
; OPT-NEXT: bfe.s32 %r4, %r1, 0, 7;
|
||||
; OPT-NEXT: mul.wide.s32 %rd1, %r4, %r3;
|
||||
; OPT-NEXT: st.param.b64 [func_retval0], %rd1;
|
||||
; OPT-NEXT: ret;
|
||||
;
|
||||
; NOOPT-LABEL: mulwides7(
|
||||
; NOOPT: {
|
||||
; NOOPT-NEXT: .reg .b16 %rs<3>;
|
||||
; NOOPT-NEXT: .reg .b16 %rs<9>;
|
||||
; NOOPT-NEXT: .reg .b64 %rd<6>;
|
||||
; NOOPT-EMPTY:
|
||||
; NOOPT-NEXT: // %bb.0:
|
||||
; NOOPT-NEXT: ld.param.b8 %rs2, [mulwides7_param_1];
|
||||
; NOOPT-NEXT: ld.param.b8 %rs1, [mulwides7_param_0];
|
||||
; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs1;
|
||||
; NOOPT-NEXT: bfe.s64 %rd2, %rd1, 0, 7;
|
||||
; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs2;
|
||||
; NOOPT-NEXT: bfe.s64 %rd4, %rd3, 0, 7;
|
||||
; NOOPT-NEXT: mul.lo.s64 %rd5, %rd2, %rd4;
|
||||
; NOOPT-NEXT: ld.param.b8 %rs3, [mulwides7_param_0+1];
|
||||
; NOOPT-NEXT: shl.b16 %rs4, %rs3, 8;
|
||||
; NOOPT-NEXT: ld.param.b8 %rs5, [mulwides7_param_0];
|
||||
; NOOPT-NEXT: or.b16 %rs1, %rs4, %rs5;
|
||||
; NOOPT-NEXT: ld.param.b8 %rs6, [mulwides7_param_1];
|
||||
; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs6;
|
||||
; NOOPT-NEXT: cvt.u64.u16 %rd2, %rs5;
|
||||
; NOOPT-NEXT: ld.param.b8 %rs7, [mulwides7_param_1+1];
|
||||
; NOOPT-NEXT: shl.b16 %rs8, %rs7, 8;
|
||||
; NOOPT-NEXT: or.b16 %rs2, %rs8, %rs6;
|
||||
; NOOPT-NEXT: bfe.s64 %rd3, %rd2, 0, 7;
|
||||
; NOOPT-NEXT: bfe.s64 %rd4, %rd1, 0, 7;
|
||||
; NOOPT-NEXT: mul.lo.s64 %rd5, %rd3, %rd4;
|
||||
; NOOPT-NEXT: st.param.b64 [func_retval0], %rd5;
|
||||
; NOOPT-NEXT: ret;
|
||||
%val0 = sext i7 %a to i64
|
||||
|
||||
33
llvm/test/CodeGen/NVPTX/no-f32x2.ll
Normal file
33
llvm/test/CodeGen/NVPTX/no-f32x2.ll
Normal file
@ -0,0 +1,33 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc < %s -mcpu=sm_100 | FileCheck %s --check-prefix=F32X2
|
||||
; RUN: llc < %s -mcpu=sm_90 | FileCheck %s --check-prefix=NOF32X2
|
||||
; RUN: llc < %s -mcpu=sm_100 -nvptx-no-f32x2 | FileCheck %s --check-prefix=NOF32X2
|
||||
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
define <2 x float> @test(<2 x float> %a, <2 x float> %b) {
|
||||
; F32X2-LABEL: test(
|
||||
; F32X2: {
|
||||
; F32X2-NEXT: .reg .b64 %rd<4>;
|
||||
; F32X2-EMPTY:
|
||||
; F32X2-NEXT: // %bb.0:
|
||||
; F32X2-NEXT: ld.param.b64 %rd1, [test_param_0];
|
||||
; F32X2-NEXT: ld.param.b64 %rd2, [test_param_1];
|
||||
; F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
|
||||
; F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; F32X2-NEXT: ret;
|
||||
;
|
||||
; NOF32X2-LABEL: test(
|
||||
; NOF32X2: {
|
||||
; NOF32X2-NEXT: .reg .b32 %r<7>;
|
||||
; NOF32X2-EMPTY:
|
||||
; NOF32X2-NEXT: // %bb.0:
|
||||
; NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_param_0];
|
||||
; NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_param_1];
|
||||
; NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4;
|
||||
; NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3;
|
||||
; NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
|
||||
; NOF32X2-NEXT: ret;
|
||||
%c = fadd <2 x float> %a, %b
|
||||
ret <2 x float> %c
|
||||
}
|
||||
@ -523,8 +523,7 @@ define <9 x half> @test_v9f16(<9 x half> %a) {
|
||||
; CHECK: .func (.param .b32 func_retval0)
|
||||
; CHECK-LABEL: test_i19(
|
||||
; CHECK-NEXT: .param .b32 test_i19_param_0
|
||||
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0];
|
||||
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2];
|
||||
; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i19_param_0];
|
||||
; CHECK: .param .b32 param0;
|
||||
; CHECK: .param .b32 retval0;
|
||||
; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
|
||||
@ -540,8 +539,7 @@ define i19 @test_i19(i19 %a) {
|
||||
; CHECK: .func (.param .b32 func_retval0)
|
||||
; CHECK-LABEL: test_i23(
|
||||
; CHECK-NEXT: .param .b32 test_i23_param_0
|
||||
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0];
|
||||
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2];
|
||||
; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i23_param_0];
|
||||
; CHECK: .param .b32 param0;
|
||||
; CHECK: .param .b32 retval0;
|
||||
; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
|
||||
@ -557,8 +555,7 @@ define i23 @test_i23(i23 %a) {
|
||||
; CHECK: .func (.param .b32 func_retval0)
|
||||
; CHECK-LABEL: test_i24(
|
||||
; CHECK-NEXT: .param .b32 test_i24_param_0
|
||||
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i24_param_0+2];
|
||||
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i24_param_0];
|
||||
; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i24_param_0];
|
||||
; CHECK: .param .b32 param0;
|
||||
; CHECK: .param .b32 retval0;
|
||||
; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
|
||||
@ -678,8 +675,7 @@ define float @test_f32(float %a) {
|
||||
; CHECK: .func (.param .b64 func_retval0)
|
||||
; CHECK-LABEL: test_i40(
|
||||
; CHECK-NEXT: .param .b64 test_i40_param_0
|
||||
; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i40_param_0+4];
|
||||
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0];
|
||||
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i40_param_0];
|
||||
; CHECK: .param .b64 param0;
|
||||
; CHECK: .param .b64 retval0;
|
||||
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
|
||||
@ -695,8 +691,7 @@ define i40 @test_i40(i40 %a) {
|
||||
; CHECK: .func (.param .b64 func_retval0)
|
||||
; CHECK-LABEL: test_i47(
|
||||
; CHECK-NEXT: .param .b64 test_i47_param_0
|
||||
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4];
|
||||
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0];
|
||||
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i47_param_0];
|
||||
; CHECK: .param .b64 param0;
|
||||
; CHECK: .param .b64 retval0;
|
||||
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
|
||||
@ -712,8 +707,7 @@ define i47 @test_i47(i47 %a) {
|
||||
; CHECK: .func (.param .b64 func_retval0)
|
||||
; CHECK-LABEL: test_i48(
|
||||
; CHECK-NEXT: .param .b64 test_i48_param_0
|
||||
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4];
|
||||
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0];
|
||||
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i48_param_0];
|
||||
; CHECK: .param .b64 param0;
|
||||
; CHECK: .param .b64 retval0;
|
||||
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
|
||||
@ -729,9 +723,7 @@ define i48 @test_i48(i48 %a) {
|
||||
; CHECK: .func (.param .b64 func_retval0)
|
||||
; CHECK-LABEL: test_i51(
|
||||
; CHECK-NEXT: .param .b64 test_i51_param_0
|
||||
; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i51_param_0+6];
|
||||
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4];
|
||||
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0];
|
||||
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i51_param_0];
|
||||
; CHECK: .param .b64 param0;
|
||||
; CHECK: .param .b64 retval0;
|
||||
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
|
||||
@ -747,9 +739,7 @@ define i51 @test_i51(i51 %a) {
|
||||
; CHECK: .func (.param .b64 func_retval0)
|
||||
; CHECK-LABEL: test_i56(
|
||||
; CHECK-NEXT: .param .b64 test_i56_param_0
|
||||
; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i56_param_0+6];
|
||||
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4];
|
||||
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0];
|
||||
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i56_param_0];
|
||||
; CHECK: .param .b64 param0;
|
||||
; CHECK: .param .b64 retval0;
|
||||
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
|
||||
|
||||
@ -17,17 +17,16 @@ define ptx_kernel void @Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel(<2 x float> %0) {
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .pred %p<2>;
|
||||
; CHECK-NEXT: .reg .b16 %rs<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %.preheader15
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0];
|
||||
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0];
|
||||
; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000;
|
||||
; CHECK-NEXT: selp.b16 %rs1, 1, 0, %p1;
|
||||
; CHECK-NEXT: $L__BB0_1: // =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: mov.b64 %rd2, 0;
|
||||
; CHECK-NEXT: st.b8 [%rd2], %rs1;
|
||||
; CHECK-NEXT: mov.b64 %rd1, 0;
|
||||
; CHECK-NEXT: st.b8 [%rd1], %rs1;
|
||||
; CHECK-NEXT: bra.uni $L__BB0_1;
|
||||
.preheader15:
|
||||
br label %1
|
||||
|
||||
@ -25,11 +25,11 @@ define float @test_gv_float() {
|
||||
define <2 x float> @test_gv_float2() {
|
||||
; CHECK-LABEL: test_gv_float2(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.global.nc.b64 %rd1, [gv_float2];
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
|
||||
; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [gv_float2];
|
||||
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
|
||||
; CHECK-NEXT: ret;
|
||||
%v = load <2 x float>, ptr @gv_float2
|
||||
ret <2 x float> %v
|
||||
@ -38,11 +38,11 @@ define <2 x float> @test_gv_float2() {
|
||||
define <4 x float> @test_gv_float4() {
|
||||
; CHECK-LABEL: test_gv_float4(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.global.nc.v2.b64 {%rd1, %rd2}, [gv_float4];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
|
||||
; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [gv_float4];
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
%v = load <4 x float>, ptr @gv_float4
|
||||
ret <4 x float> %v
|
||||
|
||||
@ -86,28 +86,46 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
|
||||
}
|
||||
|
||||
define float @reduce_fadd_float(<8 x float> %in) {
|
||||
; CHECK-LABEL: reduce_fadd_float(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b32 %r<17>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16];
|
||||
; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4;
|
||||
; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3;
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0];
|
||||
; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
|
||||
; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
|
||||
; CHECK-NEXT: add.rn.f32 %r9, %r7, 0f00000000;
|
||||
; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8;
|
||||
; CHECK-NEXT: add.rn.f32 %r11, %r10, %r5;
|
||||
; CHECK-NEXT: add.rn.f32 %r12, %r11, %r6;
|
||||
; CHECK-NEXT: add.rn.f32 %r13, %r12, %r3;
|
||||
; CHECK-NEXT: add.rn.f32 %r14, %r13, %r4;
|
||||
; CHECK-NEXT: add.rn.f32 %r15, %r14, %r1;
|
||||
; CHECK-NEXT: add.rn.f32 %r16, %r15, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r16;
|
||||
; CHECK-NEXT: ret;
|
||||
; CHECK-SM80-LABEL: reduce_fadd_float(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<17>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0];
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r9, %r1, 0f00000000;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r2;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r3;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r12, %r11, %r4;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r5;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r6;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r7;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, %r8;
|
||||
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16;
|
||||
; CHECK-SM80-NEXT: ret;
|
||||
;
|
||||
; CHECK-SM100-LABEL: reduce_fadd_float(
|
||||
; CHECK-SM100: {
|
||||
; CHECK-SM100-NEXT: .reg .b32 %r<17>;
|
||||
; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM100-EMPTY:
|
||||
; CHECK-SM100-NEXT: // %bb.0:
|
||||
; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16];
|
||||
; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
|
||||
; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3;
|
||||
; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0];
|
||||
; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2;
|
||||
; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1;
|
||||
; CHECK-SM100-NEXT: add.rn.f32 %r9, %r7, 0f00000000;
|
||||
; CHECK-SM100-NEXT: add.rn.f32 %r10, %r9, %r8;
|
||||
; CHECK-SM100-NEXT: add.rn.f32 %r11, %r10, %r5;
|
||||
; CHECK-SM100-NEXT: add.rn.f32 %r12, %r11, %r6;
|
||||
; CHECK-SM100-NEXT: add.rn.f32 %r13, %r12, %r3;
|
||||
; CHECK-SM100-NEXT: add.rn.f32 %r14, %r13, %r4;
|
||||
; CHECK-SM100-NEXT: add.rn.f32 %r15, %r14, %r1;
|
||||
; CHECK-SM100-NEXT: add.rn.f32 %r16, %r15, %r2;
|
||||
; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r16;
|
||||
; CHECK-SM100-NEXT: ret;
|
||||
%res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
|
||||
ret float %res
|
||||
}
|
||||
@ -116,20 +134,15 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fadd_float_reassoc(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<17>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r5, %r4, %r2;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r7;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r5;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r12, %r3, %r1;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r13, %r8, %r6;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r9, %r4, %r8;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r10, %r2, %r6;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r9;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r12, %r3, %r7;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r13, %r1, %r5;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r12;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r11;
|
||||
; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000;
|
||||
@ -272,27 +285,44 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
|
||||
}
|
||||
|
||||
define float @reduce_fmul_float(<8 x float> %in) {
|
||||
; CHECK-LABEL: reduce_fmul_float(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16];
|
||||
; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4;
|
||||
; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3;
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0];
|
||||
; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
|
||||
; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
|
||||
; CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8;
|
||||
; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r5;
|
||||
; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r6;
|
||||
; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r3;
|
||||
; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r4;
|
||||
; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r1;
|
||||
; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
|
||||
; CHECK-NEXT: ret;
|
||||
; CHECK-SM80-LABEL: reduce_fmul_float(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0];
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r1, %r2;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r3;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r4;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r11, %r5;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r6;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r7;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r8;
|
||||
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
|
||||
; CHECK-SM80-NEXT: ret;
|
||||
;
|
||||
; CHECK-SM100-LABEL: reduce_fmul_float(
|
||||
; CHECK-SM100: {
|
||||
; CHECK-SM100-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM100-EMPTY:
|
||||
; CHECK-SM100-NEXT: // %bb.0:
|
||||
; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16];
|
||||
; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
|
||||
; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3;
|
||||
; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0];
|
||||
; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2;
|
||||
; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1;
|
||||
; CHECK-SM100-NEXT: mul.rn.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM100-NEXT: mul.rn.f32 %r10, %r9, %r5;
|
||||
; CHECK-SM100-NEXT: mul.rn.f32 %r11, %r10, %r6;
|
||||
; CHECK-SM100-NEXT: mul.rn.f32 %r12, %r11, %r3;
|
||||
; CHECK-SM100-NEXT: mul.rn.f32 %r13, %r12, %r4;
|
||||
; CHECK-SM100-NEXT: mul.rn.f32 %r14, %r13, %r1;
|
||||
; CHECK-SM100-NEXT: mul.rn.f32 %r15, %r14, %r2;
|
||||
; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
|
||||
; CHECK-SM100-NEXT: ret;
|
||||
%res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
|
||||
ret float %res
|
||||
}
|
||||
@ -301,20 +331,15 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmul_float_reassoc(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r5, %r4, %r2;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r7;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r5;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r1;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r8, %r6;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r4, %r8;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r2, %r6;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r9;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r7;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r1, %r5;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r12;
|
||||
; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r11;
|
||||
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
|
||||
@ -495,15 +520,10 @@ define float @reduce_fmax_float(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmax_float(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
|
||||
; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9;
|
||||
@ -540,15 +560,10 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmax_float_reassoc(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9;
|
||||
@ -620,15 +635,10 @@ define float @reduce_fmax_float_nnan(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmax_float_nnan(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_nnan_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_nnan_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_nnan_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_nnan_param_0];
|
||||
; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9;
|
||||
@ -809,15 +819,10 @@ define float @reduce_fmin_float(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmin_float(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
|
||||
; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9;
|
||||
@ -854,15 +859,10 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmin_float_reassoc(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9;
|
||||
@ -934,15 +934,10 @@ define float @reduce_fmin_float_nnan(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmin_float_nnan(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_nnan_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_nnan_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_nnan_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_nnan_param_0];
|
||||
; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9;
|
||||
@ -1078,15 +1073,10 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmaximum_float(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
|
||||
; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9;
|
||||
@ -1123,15 +1113,10 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9;
|
||||
@ -1267,15 +1252,10 @@ define float @reduce_fminimum_float(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fminimum_float(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
|
||||
; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9;
|
||||
@ -1312,15 +1292,10 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
|
||||
; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc(
|
||||
; CHECK-SM80: {
|
||||
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
|
||||
; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-SM80-EMPTY:
|
||||
; CHECK-SM80-NEXT: // %bb.0:
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
|
||||
; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
|
||||
; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
|
||||
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
|
||||
; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8;
|
||||
; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6;
|
||||
; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9;
|
||||
|
||||
@ -7,17 +7,17 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
|
||||
define <16 x float> @test_v16f32(<16 x float> %a) {
|
||||
; CHECK-LABEL: test_v16f32(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<9>;
|
||||
; CHECK-NEXT: .reg .b32 %r<17>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v16f32_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v16f32_param_0+16];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v16f32_param_0+32];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v16f32_param_0+48];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd7, %rd8};
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6};
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v16f32_param_0];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v16f32_param_0+16];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [test_v16f32_param_0+32];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [test_v16f32_param_0+48];
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0+48], {%r13, %r14, %r15, %r16};
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0+32], {%r9, %r10, %r11, %r12};
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8};
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
ret <16 x float> %a
|
||||
}
|
||||
@ -25,13 +25,13 @@ define <16 x float> @test_v16f32(<16 x float> %a) {
|
||||
define <8 x float> @test_v8f32(<8 x float> %a) {
|
||||
; CHECK-LABEL: test_v8f32(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-NEXT: .reg .b32 %r<9>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8f32_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8f32_param_0+16];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v8f32_param_0];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v8f32_param_0+16];
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8};
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
ret <8 x float> %a
|
||||
}
|
||||
@ -39,11 +39,11 @@ define <8 x float> @test_v8f32(<8 x float> %a) {
|
||||
define <4 x float> @test_v4f32(<4 x float> %a) {
|
||||
; CHECK-LABEL: test_v4f32(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4f32_param_0];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v4f32_param_0];
|
||||
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
ret <4 x float> %a
|
||||
}
|
||||
@ -51,11 +51,11 @@ define <4 x float> @test_v4f32(<4 x float> %a) {
|
||||
define <2 x float> @test_v2f32(<2 x float> %a) {
|
||||
; CHECK-LABEL: test_v2f32(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0];
|
||||
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
|
||||
; CHECK-NEXT: ret;
|
||||
ret <2 x float> %a
|
||||
}
|
||||
@ -64,14 +64,13 @@ define <2 x float> @test_v2f32(<2 x float> %a) {
|
||||
define <3 x float> @test_v3f32(<3 x float> %a) {
|
||||
; CHECK-LABEL: test_v3f32(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0+8], %r3;
|
||||
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
|
||||
; CHECK-NEXT: ret;
|
||||
ret <3 x float> %a
|
||||
}
|
||||
|
||||
@ -206,18 +206,18 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_global_a16_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_global_a16_param_1];
|
||||
; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r5, %rs2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r6, %rs1;
|
||||
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r7, %rs4;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r8, %rs3;
|
||||
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r9, %rs6;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r10, %rs5;
|
||||
; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r11, %rs8;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r12, %rs7;
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
|
||||
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4;
|
||||
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
|
||||
; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r5, %rs8;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r6, %rs7;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r7, %rs6;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r8, %rs5;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r9, %rs4;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r10, %rs3;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r11, %rs2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r12, %rs1;
|
||||
; CHECK-NEXT: st.global.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9};
|
||||
; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
|
||||
; CHECK-NEXT: ret;
|
||||
@ -270,18 +270,18 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_generic_a16_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_generic_a16_param_1];
|
||||
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r5, %rs2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r6, %rs1;
|
||||
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r7, %rs4;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r8, %rs3;
|
||||
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r9, %rs6;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r10, %rs5;
|
||||
; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r11, %rs8;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r12, %rs7;
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
|
||||
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4;
|
||||
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
|
||||
; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r5, %rs8;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r6, %rs7;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r7, %rs6;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r8, %rs5;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r9, %rs4;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r10, %rs3;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r11, %rs2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %r12, %rs1;
|
||||
; CHECK-NEXT: st.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9};
|
||||
; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@ -5,12 +5,13 @@
|
||||
define void @foo1(<2 x float> %val, ptr %ptr) {
|
||||
; CHECK-LABEL: foo1(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [foo1_param_1];
|
||||
; CHECK-NEXT: st.b64 [%rd2], %rd1;
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo1_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_1];
|
||||
; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2};
|
||||
; CHECK-NEXT: ret;
|
||||
store <2 x float> %val, ptr %ptr
|
||||
ret void
|
||||
@ -19,12 +20,13 @@ define void @foo1(<2 x float> %val, ptr %ptr) {
|
||||
define void @foo2(<4 x float> %val, ptr %ptr) {
|
||||
; CHECK-LABEL: foo2(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd3, [foo2_param_1];
|
||||
; CHECK-NEXT: st.v2.b64 [%rd3], {%rd1, %rd2};
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [foo2_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [foo2_param_1];
|
||||
; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
store <4 x float> %val, ptr %ptr
|
||||
ret void
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user