Revert "[LSV] Merge contiguous chains across scalar types" (#170381)
Reverts llvm/llvm-project#154069. I pointed out a number of issues post-merge, most importantly examples of miscompiles: https://github.com/llvm/llvm-project/pull/154069#issuecomment-3603854626. While the motivation of the change is clear, I think the implementation approach is flawed. It seems like the goal is to allow elements like `load <2xi16>` and `load i32` to be vectorized together despite the current algorithm not grouping them into the same equivalence classes. I personally think that if we want to attempt this it should be a more holistic approach, maybe even redefining the concept of an equivalence class. This current solution seems like it would be really hard to do bug-free, and even if the bugs were not present, it is only able to merge chains that happen to be adjacent to each other after `splitChainByContiguity`, which seems like it is leaving things up to chance whether this optimization kicks in. But we can discuss more in the re-land. Maybe the broader approach I'm proposing is too difficult, and a narrow optimization is worthwhile. Regardless, this should be reverted; it needs more iteration before it is correct.
This commit is contained in:
parent
e5f1d025aa
commit
9c78bc5de4
@ -431,7 +431,7 @@ LLVM_ABI void combineAAMetadata(Instruction *K, const Instruction *J);
|
||||
|
||||
/// Copy the metadata from the source instruction to the destination (the
|
||||
/// replacement for the source instruction).
|
||||
LLVM_ABI void copyMetadataForAccess(Instruction &Dest, Instruction &Source);
|
||||
LLVM_ABI void copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source);
|
||||
|
||||
/// Patch the replacement so that it is not more restrictive than the value
|
||||
/// being replaced. It assumes that the replacement does not get moved from
|
||||
|
||||
@ -1035,7 +1035,7 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
|
||||
LoadInst *NewLI = IRB.CreateAlignedLoad(
|
||||
LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
|
||||
Name + ".off." + Twine(ByteOffset));
|
||||
copyMetadataForAccess(*NewLI, OrigLI);
|
||||
copyMetadataForLoad(*NewLI, OrigLI);
|
||||
NewLI->setAAMetadata(
|
||||
AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
|
||||
NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID());
|
||||
|
||||
@ -415,7 +415,7 @@ void PointerReplacer::replace(Instruction *I) {
|
||||
LT->getAlign(), LT->getOrdering(),
|
||||
LT->getSyncScopeID());
|
||||
NewI->takeName(LT);
|
||||
copyMetadataForAccess(*NewI, *LT);
|
||||
copyMetadataForLoad(*NewI, *LT);
|
||||
|
||||
IC.InsertNewInstWith(NewI, LT->getIterator());
|
||||
IC.replaceInstUsesWith(*LT, NewI);
|
||||
@ -606,7 +606,7 @@ LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
|
||||
Builder.CreateAlignedLoad(NewTy, LI.getPointerOperand(), LI.getAlign(),
|
||||
LI.isVolatile(), LI.getName() + Suffix);
|
||||
NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
|
||||
copyMetadataForAccess(*NewLoad, LI);
|
||||
copyMetadataForLoad(*NewLoad, LI);
|
||||
return NewLoad;
|
||||
}
|
||||
|
||||
|
||||
@ -3272,7 +3272,7 @@ private:
|
||||
// Copy any metadata that is valid for the new load. This may require
|
||||
// conversion to a different kind of metadata, e.g. !nonnull might change
|
||||
// to !range or vice versa.
|
||||
copyMetadataForAccess(*NewLI, LI);
|
||||
copyMetadataForLoad(*NewLI, LI);
|
||||
|
||||
// Do this after copyMetadataForLoad() to preserve the TBAA shift.
|
||||
if (AATags)
|
||||
|
||||
@ -3100,70 +3100,54 @@ void llvm::combineAAMetadata(Instruction *K, const Instruction *J) {
|
||||
combineMetadata(K, J, /*DoesKMove=*/true, /*AAOnly=*/true);
|
||||
}
|
||||
|
||||
void llvm::copyMetadataForAccess(Instruction &DestI, Instruction &SourceI) {
|
||||
void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
|
||||
SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
|
||||
SourceI.getAllMetadata(MD);
|
||||
MDBuilder MDB(DestI.getContext());
|
||||
Type *NewType = DestI.getType();
|
||||
|
||||
// Only needed for range metadata on loads.
|
||||
const DataLayout *DL = nullptr;
|
||||
const LoadInst *LSource = dyn_cast<LoadInst>(&SourceI);
|
||||
if (LSource)
|
||||
DL = &LSource->getDataLayout();
|
||||
|
||||
Source.getAllMetadata(MD);
|
||||
MDBuilder MDB(Dest.getContext());
|
||||
Type *NewType = Dest.getType();
|
||||
const DataLayout &DL = Source.getDataLayout();
|
||||
for (const auto &MDPair : MD) {
|
||||
unsigned ID = MDPair.first;
|
||||
MDNode *N = MDPair.second;
|
||||
|
||||
// Note, essentially every kind of metadata should be preserved here! This
|
||||
// routine is supposed to clone a load instruction changing *only its type*.
|
||||
// The only metadata it makes sense to drop is metadata which is invalidated
|
||||
// when the pointer type changes. This should essentially never be the case
|
||||
// in LLVM, but we explicitly switch over only known metadata to be
|
||||
// conservatively correct. If you are adding metadata to LLVM which pertains
|
||||
// to loads, you almost certainly want to add it here.
|
||||
switch (ID) {
|
||||
// Applies to both loads and stores as-is.
|
||||
case LLVMContext::MD_dbg:
|
||||
case LLVMContext::MD_tbaa:
|
||||
case LLVMContext::MD_prof:
|
||||
case LLVMContext::MD_fpmath:
|
||||
case LLVMContext::MD_tbaa_struct:
|
||||
case LLVMContext::MD_invariant_load:
|
||||
case LLVMContext::MD_alias_scope:
|
||||
case LLVMContext::MD_noalias:
|
||||
case LLVMContext::MD_nontemporal:
|
||||
case LLVMContext::MD_mem_parallel_loop_access:
|
||||
case LLVMContext::MD_access_group:
|
||||
case LLVMContext::MD_noundef:
|
||||
case LLVMContext::MD_noalias_addrspace:
|
||||
case LLVMContext::MD_mem_parallel_loop_access:
|
||||
DestI.setMetadata(ID, N);
|
||||
break;
|
||||
|
||||
// Load-only metadata.
|
||||
case LLVMContext::MD_fpmath:
|
||||
case LLVMContext::MD_invariant_load:
|
||||
if (isa<LoadInst>(DestI))
|
||||
DestI.setMetadata(ID, N);
|
||||
// All of these directly apply.
|
||||
Dest.setMetadata(ID, N);
|
||||
break;
|
||||
|
||||
case LLVMContext::MD_nonnull:
|
||||
if (auto *LDest = dyn_cast<LoadInst>(&DestI)) {
|
||||
if (LSource)
|
||||
copyNonnullMetadata(*LSource, N, *LDest);
|
||||
}
|
||||
copyNonnullMetadata(Source, N, Dest);
|
||||
break;
|
||||
|
||||
case LLVMContext::MD_align:
|
||||
case LLVMContext::MD_dereferenceable:
|
||||
case LLVMContext::MD_dereferenceable_or_null:
|
||||
// Applies to both loads and stores only if the new type is also a
|
||||
// pointer.
|
||||
// These only directly apply if the new type is also a pointer.
|
||||
if (NewType->isPointerTy())
|
||||
DestI.setMetadata(ID, N);
|
||||
Dest.setMetadata(ID, N);
|
||||
break;
|
||||
|
||||
case LLVMContext::MD_range:
|
||||
if (auto *LDest = dyn_cast<LoadInst>(&DestI)) {
|
||||
if (LSource && DL)
|
||||
copyRangeMetadata(*DL, *LSource, N, *LDest);
|
||||
}
|
||||
break;
|
||||
|
||||
case LLVMContext::MD_tbaa:
|
||||
if (isa<LoadInst>(DestI))
|
||||
DestI.setMetadata(ID, N);
|
||||
copyRangeMetadata(DL, Source, N, Dest);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -112,7 +112,6 @@
|
||||
#include <optional>
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@ -269,6 +268,11 @@ private:
|
||||
/// isGuaranteedToTransferExecutionToSuccessor(I) == true.
|
||||
bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
|
||||
|
||||
/// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
|
||||
/// in the same BB with the same value for getUnderlyingObject() etc.
|
||||
bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
|
||||
ArrayRef<Instruction *> EqClass);
|
||||
|
||||
/// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
|
||||
/// where all instructions access a known, constant offset from the first
|
||||
/// instruction.
|
||||
@ -334,22 +338,12 @@ private:
|
||||
EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
|
||||
BasicBlock::iterator End);
|
||||
|
||||
/// Inserts a cast instruction to convert Inst to DstTy.
|
||||
Value *insertCast(Value *Val, Type *DstTy);
|
||||
|
||||
/// Partitions Instrs into "chains" where every instruction has a known
|
||||
/// constant offset from the first instr in the chain.
|
||||
///
|
||||
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
|
||||
/// in the chain is the leader, and an instr touches distance 0 from itself.
|
||||
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
|
||||
|
||||
// Helpers for chain merging.
|
||||
std::optional<APInt> computeLeaderDelta(Instruction *I1, Instruction *I2);
|
||||
bool chainsOverlapAfterRebase(const Chain &A, const Chain &B,
|
||||
const APInt &Delta) const;
|
||||
static void rebaseChain(Chain &C, const APInt &Delta);
|
||||
void normalizeChainToType(Chain &C, Type *CastTy);
|
||||
};
|
||||
|
||||
class LoadStoreVectorizerLegacyPass : public FunctionPass {
|
||||
@ -431,20 +425,6 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
|
||||
return Changed ? PA : PreservedAnalyses::all();
|
||||
}
|
||||
|
||||
/// Returns the value used to group accesses to \p Ptr: the result of
/// llvm::getUnderlyingObject(), except that a select is replaced by its
/// condition.
static const Value *getUnderlyingObject(const Value *Ptr) {
  const Value *Base = llvm::getUnderlyingObject(Ptr);
  const auto *Select = dyn_cast<SelectInst>(Base);
  if (!Select)
    return Base;

  // Two selects are distinct instructions even when they share a condition
  // and yield consecutive pointers for its true and false values. Grouping on
  // the selects would therefore separate consecutive accesses into different
  // lists, so they would never be checked for contiguity or vectorized.
  // Grouping on the shared condition keeps them together.
  return Select->getCondition();
}
|
||||
|
||||
bool Vectorizer::run() {
|
||||
bool Changed = false;
|
||||
// Break up the BB if there are any instrs which aren't guaranteed to transfer
|
||||
@ -488,88 +468,6 @@ bool Vectorizer::run() {
|
||||
return Changed;
|
||||
}
|
||||
|
||||
/// Emits, at the current Builder insertion point, a cast of \p Val to
/// \p DstTy.
///
/// Types of equal bit width are converted with a bit-or-pointer cast. Integer
/// types of differing width are zero-extended or truncated. Returns nullptr
/// when the widths differ and at least one of the two types is not an integer.
Value *Vectorizer::insertCast(Value *Val, Type *DstTy) {
  Type *SrcTy = Val->getType();
  const auto SrcBits = DL.getTypeSizeInBits(SrcTy);
  const auto DstBits = DL.getTypeSizeInBits(DstTy);

  // Same width: a plain reinterpretation is enough.
  if (SrcBits == DstBits)
    return Builder.CreateBitOrPointerCast(Val, DstTy, Val->getName() + ".bc");

  // Differing widths can only be bridged between integer types.
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return nullptr;

  return SrcBits < DstBits
             ? Builder.CreateZExt(Val, DstTy, Val->getName() + ".bc")
             : Builder.CreateTrunc(Val, DstTy, Val->getName() + ".bc");
}
|
||||
|
||||
/// Computes the constant offset between the pointer operands of \p I1 and
/// \p I2 by forwarding to getConstantOffset(), using whichever of the two
/// instructions comes later as the context instruction.
///
/// Both instructions must be loads, or both must be stores. The result is
/// whatever getConstantOffset() yields for the pair.
std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
                                                    Instruction *I2) {
  assert(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (isa<StoreInst>(I1) && isa<StoreInst>(I2))) &&
         "computeLeaderDelta must be called with two load or two store "
         "instructions");

  // The later of the two instructions serves as the query context.
  Instruction *Later = I1;
  if (I1->comesBefore(I2))
    Later = I2;

  Value *FirstPtr = getLoadStorePointerOperand(I1);
  Value *SecondPtr = getLoadStorePointerOperand(I2);
  return getConstantOffset(FirstPtr, SecondPtr, Later);
}
|
||||
|
||||
bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B,
|
||||
const APInt &Delta) const {
|
||||
ConstantRange ARange(
|
||||
A.front().OffsetFromLeader,
|
||||
A.back().OffsetFromLeader +
|
||||
DL.getTypeStoreSize(getLoadStoreType(A.back().Inst)));
|
||||
ConstantRange BRange(
|
||||
B.front().OffsetFromLeader + Delta,
|
||||
B.back().OffsetFromLeader + Delta +
|
||||
DL.getTypeStoreSize(getLoadStoreType(B.back().Inst)));
|
||||
return !ARange.intersectWith(BRange).isEmptySet();
|
||||
}
|
||||
|
||||
/// Shifts the leader-relative offset of every element of \p C by \p Delta,
/// in place.
void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) {
  for (auto It = C.begin(), End = C.end(); It != End; ++It)
    It->OffsetFromLeader += Delta;
}
|
||||
|
||||
/// Rewrites every access in \p C whose value type differs from \p CastTy so
/// that it loads or stores a value of type \p CastTy instead.
///
/// For a load, a new load of \p CastTy is emitted in front of the original,
/// its result is cast back to the original type, and all uses of the original
/// are redirected to that cast. For a store, the stored value is cast to
/// \p CastTy and a new store is emitted in front of the original. In both
/// cases the original instruction is queued in ToErase and the chain element
/// is updated to refer to the replacement.
///
/// insertCast() must be able to convert between the original value type and
/// \p CastTy; a failed cast is treated as unreachable.
void Vectorizer::normalizeChainToType(Chain &C, Type *CastTy) {
  for (ChainElem &Elem : C) {
    Instruction *Inst = Elem.Inst;
    Type *OrigValTy = getLoadStoreType(Inst);
    // Already of the requested type: nothing to rewrite.
    if (OrigValTy == CastTy)
      continue;

    if (auto *LI = dyn_cast<LoadInst>(Inst)) {
      Builder.SetInsertPoint(LI);
      // Carry the original alignment and volatility over explicitly, as the
      // store path below does. The plain CreateLoad overload derives the
      // alignment from CastTy rather than from LI, which could overstate the
      // alignment of an under-aligned access.
      LoadInst *NewLoad = Builder.CreateAlignedLoad(
          CastTy, LI->getPointerOperand(), LI->getAlign(), LI->isVolatile(),
          LI->getName() + ".mut");
      copyMetadataForAccess(*NewLoad, *LI);
      // Users still expect the original type, so cast the new value back.
      Value *CastBack = insertCast(NewLoad, OrigValTy);
      if (!CastBack)
        llvm_unreachable("Failed to insert cast");
      LI->replaceAllUsesWith(CastBack);
      ToErase.emplace_back(LI);
      Elem.Inst = NewLoad;
    } else if (auto *SI = dyn_cast<StoreInst>(Inst)) {
      Builder.SetInsertPoint(SI);
      Value *CastVal = insertCast(SI->getValueOperand(), CastTy);
      if (!CastVal)
        llvm_unreachable("Failed to insert cast");
      StoreInst *NewStore =
          Builder.CreateStore(CastVal, SI->getPointerOperand());
      NewStore->setAlignment(SI->getAlign());
      NewStore->setVolatile(SI->isVolatile());
      copyMetadataForAccess(*NewStore, *SI);
      ToErase.emplace_back(SI);
      Elem.Inst = NewStore;
    }
  }
}
|
||||
|
||||
bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
|
||||
BasicBlock::iterator End) {
|
||||
LLVM_DEBUG({
|
||||
@ -582,120 +480,49 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
|
||||
});
|
||||
|
||||
bool Changed = false;
|
||||
SmallVector<Chain> ContiguousSubChains;
|
||||
|
||||
for (const auto &[EqClassKey, EqClass] :
|
||||
collectEquivalenceClasses(Begin, End)) {
|
||||
collectEquivalenceClasses(Begin, End))
|
||||
Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
|
||||
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
|
||||
<< " keyed on " << EqClassKey << ":\n";
|
||||
for (Instruction *I : EqClass)
|
||||
dbgs() << " " << *I << "\n";
|
||||
});
|
||||
return Changed;
|
||||
}
|
||||
|
||||
for (Chain &C : gatherChains(EqClass)) {
|
||||
bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
|
||||
ArrayRef<Instruction *> EqClass) {
|
||||
bool Changed = false;
|
||||
|
||||
// Split up the chain into increasingly smaller chains, until we can
|
||||
// finally vectorize the chains.
|
||||
//
|
||||
// (Don't be scared by the depth of the loop nest here. These operations
|
||||
// are all at worst O(n lg n) in the number of instructions, and splitting
|
||||
// chains doesn't change the number of instrs. So the whole loop nest is
|
||||
// O(n lg n).)
|
||||
for (auto &C : splitChainByMayAliasInstrs(C)) {
|
||||
for (auto &C : splitChainByContiguity(C)) {
|
||||
ContiguousSubChains.emplace_back(C);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
|
||||
<< " keyed on " << EqClassKey << ":\n";
|
||||
for (Instruction *I : EqClass)
|
||||
dbgs() << " " << *I << "\n";
|
||||
});
|
||||
|
||||
// Merge chains in reverse order, so that the first chain is the largest.
|
||||
for (int I = ContiguousSubChains.size() - 1; I > 0; I--) {
|
||||
Chain &C1 = ContiguousSubChains[I - 1];
|
||||
Chain &C2 = ContiguousSubChains[I];
|
||||
std::vector<Chain> Chains = gatherChains(EqClass);
|
||||
LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
|
||||
<< " nontrivial chains.\n";);
|
||||
for (Chain &C : Chains)
|
||||
Changed |= runOnChain(C);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// If the scalar types of the chains are the same, we can merge them
|
||||
// without inserting any casts.
|
||||
if (getLoadStoreType(C1[0].Inst)->getScalarType() ==
|
||||
getLoadStoreType(C2[0].Inst)->getScalarType())
|
||||
continue;
|
||||
|
||||
const Value *C1Ptr = getLoadStorePointerOperand(C1[0].Inst);
|
||||
const Value *C2Ptr = getLoadStorePointerOperand(C2[0].Inst);
|
||||
unsigned AS1 = C1Ptr->getType()->getPointerAddressSpace();
|
||||
unsigned AS2 = C2Ptr->getType()->getPointerAddressSpace();
|
||||
bool C1IsLoad = isa<LoadInst>(C1[0].Inst);
|
||||
bool C2IsLoad = isa<LoadInst>(C2[0].Inst);
|
||||
|
||||
// If the chains are mapped to different types, have distinct underlying
|
||||
// pointer objects, or include both loads and stores, skip.
|
||||
if (C1IsLoad != C2IsLoad || AS1 != AS2 ||
|
||||
::getUnderlyingObject(C1Ptr) != ::getUnderlyingObject(C2Ptr))
|
||||
continue;
|
||||
|
||||
// Compute constant offset between chain leaders; if unknown, skip.
|
||||
std::optional<APInt> DeltaOpt = computeLeaderDelta(C1[0].Inst, C2[0].Inst);
|
||||
if (!DeltaOpt)
|
||||
continue;
|
||||
|
||||
// Check that rebasing C2 into C1's coordinate space will not overlap C1.
|
||||
if (chainsOverlapAfterRebase(C1, C2, *DeltaOpt))
|
||||
continue;
|
||||
|
||||
// Determine the common integer cast type for normalization and ensure total
|
||||
// bitwidth matches across all elements of both chains.
|
||||
Type *C1ElemTy = getLoadStoreType(C1[0].Inst);
|
||||
unsigned TotalBits = DL.getTypeSizeInBits(C1ElemTy);
|
||||
auto AllElemsMatchTotalBits = [&](const Chain &C) {
|
||||
return llvm::all_of(C, [&](const ChainElem &E) {
|
||||
return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits;
|
||||
});
|
||||
};
|
||||
if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2))
|
||||
continue;
|
||||
|
||||
// Power-of-two span ensures we can form a legal, single vector access
|
||||
// without padding or splitting. Many targets and cost models assume POT
|
||||
// widths, and it guarantees an integral element count for the chosen
|
||||
// VecElemTy.
|
||||
APInt Sz = C2.front().OffsetFromLeader +
|
||||
DL.getTypeStoreSize(getLoadStoreType(C2.front().Inst)) -
|
||||
C1.back().OffsetFromLeader + *DeltaOpt;
|
||||
if (!Sz.isPowerOf2())
|
||||
continue;
|
||||
|
||||
// Rebase C2's offsets into C1's coordinate space prior to merging and
|
||||
// merge C2 into C1 by appending all elements of C2 to C1, then erase C2
|
||||
// from ContiguousSubChains.
|
||||
rebaseChain(C2, *DeltaOpt);
|
||||
C1.insert(C1.end(), C2.begin(), C2.end());
|
||||
ContiguousSubChains.erase(ContiguousSubChains.begin() + I);
|
||||
|
||||
// Normalize the value operand/result type of each instruction in C1 to
|
||||
// C1CastTy.
|
||||
Type *C1CastTy =
|
||||
Type::getIntNTy(C1ElemTy->getContext(), DL.getTypeSizeInBits(C1ElemTy));
|
||||
normalizeChainToType(C1, C1CastTy);
|
||||
}
|
||||
|
||||
for (auto &C : ContiguousSubChains) {
|
||||
if (C.size() <= 1)
|
||||
continue;
|
||||
for (auto &AlignedSubChain : splitChainByAlignment(C))
|
||||
Changed |= vectorizeChain(AlignedSubChain);
|
||||
}
|
||||
|
||||
// Erase all instructions scheduled for deletion in this pseudo-BB.
|
||||
for (Instruction *I : ToErase) {
|
||||
auto *PtrOperand = getLoadStorePointerOperand(I);
|
||||
if (I->use_empty())
|
||||
I->eraseFromParent();
|
||||
RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
|
||||
}
|
||||
ToErase.clear();
|
||||
/// Attempts to vectorize a single chain \p C.
///
/// The chain is narrowed in three stages -- by may-alias instructions, then
/// by contiguity, then by alignment -- and vectorizeChain() is invoked on
/// every resulting sub-chain. Returns true if any of those calls reported a
/// change.
bool Vectorizer::runOnChain(Chain &C) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  // Keep splitting the chain into smaller pieces until each piece can be
  // vectorized.
  //
  // (The nesting depth is harmless: each split is at worst O(n lg n) in the
  // number of instructions and never changes how many instructions there are,
  // so the nest as a whole is O(n lg n).)
  bool Changed = false;
  for (auto &NoAliasChain : splitChainByMayAliasInstrs(C)) {
    for (auto &ContiguousChain : splitChainByContiguity(NoAliasChain)) {
      for (auto &AlignedChain : splitChainByAlignment(ContiguousChain))
        Changed |= vectorizeChain(AlignedChain);
    }
  }
  return Changed;
}
|
||||
|
||||
@ -756,7 +583,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
|
||||
LLVM_DEBUG(
|
||||
dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
|
||||
<< *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
|
||||
if (!NewChain.empty()) {
|
||||
if (NewChain.size() > 1) {
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
|
||||
dumpChain(NewChain);
|
||||
@ -768,7 +595,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
|
||||
NewChain = SmallVector<ChainElem, 1>({*ChainIt});
|
||||
}
|
||||
}
|
||||
if (!NewChain.empty()) {
|
||||
if (NewChain.size() > 1) {
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
|
||||
dumpChain(NewChain);
|
||||
@ -833,6 +660,8 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
|
||||
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
|
||||
}
|
||||
|
||||
// Filter out length-1 chains, these are uninteresting.
|
||||
llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
|
||||
return Ret;
|
||||
}
|
||||
|
||||
@ -852,7 +681,7 @@ Type *Vectorizer::getChainElemTy(const Chain &C) {
|
||||
if (any_of(C, [](const ChainElem &E) {
|
||||
return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
|
||||
})) {
|
||||
return IntegerType::getIntNTy(
|
||||
return Type::getIntNTy(
|
||||
F.getContext(),
|
||||
DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
|
||||
}
|
||||
@ -1640,6 +1469,20 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
|
||||
BasicBlock::iterator End) {
|
||||
EquivalenceClassMap Ret;
|
||||
|
||||
auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
|
||||
const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
|
||||
if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
|
||||
// The select's themselves are distinct instructions even if they share
|
||||
// the same condition and evaluate to consecutive pointers for true and
|
||||
// false values of the condition. Therefore using the select's themselves
|
||||
// for grouping instructions would put consecutive accesses into different
|
||||
// lists and they won't be even checked for being consecutive, and won't
|
||||
// be vectorized.
|
||||
return Sel->getCondition();
|
||||
}
|
||||
return ObjPtr;
|
||||
};
|
||||
|
||||
for (Instruction &I : make_range(Begin, End)) {
|
||||
auto *LI = dyn_cast<LoadInst>(&I);
|
||||
auto *SI = dyn_cast<StoreInst>(&I);
|
||||
@ -1687,7 +1530,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
|
||||
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
|
||||
continue;
|
||||
|
||||
Ret[{::getUnderlyingObject(Ptr), AS,
|
||||
Ret[{GetUnderlyingObject(Ptr), AS,
|
||||
DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
|
||||
/*IsLoad=*/LI != nullptr}]
|
||||
.emplace_back(&I);
|
||||
@ -1782,7 +1625,8 @@ std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
|
||||
Ret.reserve(Chains.size());
|
||||
// Iterate over MRU rather than Chains so the order is deterministic.
|
||||
for (auto &E : MRU)
|
||||
Ret.emplace_back(std::move(E.second));
|
||||
if (E.second.size() > 1)
|
||||
Ret.emplace_back(std::move(E.second));
|
||||
return Ret;
|
||||
}
|
||||
|
||||
|
||||
@ -20,5 +20,3 @@ define void @addi32(i32 %arg1, i32 %arg2) {
|
||||
store i32 %res, ptr addrspace(1) poison
|
||||
ret void
|
||||
}
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; CHECK: {{.*}}
|
||||
|
||||
@ -510,55 +510,53 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX908-LABEL: introduced_copy_to_sgpr:
|
||||
; GFX908: ; %bb.0: ; %bb
|
||||
; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc
|
||||
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
|
||||
; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
|
||||
; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
|
||||
; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18
|
||||
; GFX908-NEXT: s_mov_b32 s12, 0
|
||||
; GFX908-NEXT: s_mov_b32 s9, s12
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
|
||||
; GFX908-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
|
||||
; GFX908-NEXT: s_load_dword s5, s[8:9], 0x18
|
||||
; GFX908-NEXT: s_mov_b32 s4, 0
|
||||
; GFX908-NEXT: s_mov_b32 s9, s4
|
||||
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
|
||||
; GFX908-NEXT: s_sub_i32 s8, 0, s1
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s5
|
||||
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7
|
||||
; GFX908-NEXT: s_sub_i32 s1, 0, s7
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v17, 0
|
||||
; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s10, v0
|
||||
; GFX908-NEXT: s_mul_i32 s8, s8, s10
|
||||
; GFX908-NEXT: s_mul_hi_u32 s8, s10, s8
|
||||
; GFX908-NEXT: s_add_i32 s10, s10, s8
|
||||
; GFX908-NEXT: s_mul_hi_u32 s8, s0, s10
|
||||
; GFX908-NEXT: s_mul_i32 s10, s8, s1
|
||||
; GFX908-NEXT: s_sub_i32 s0, s0, s10
|
||||
; GFX908-NEXT: s_add_i32 s11, s8, 1
|
||||
; GFX908-NEXT: s_sub_i32 s10, s0, s1
|
||||
; GFX908-NEXT: s_cmp_ge_u32 s0, s1
|
||||
; GFX908-NEXT: s_cselect_b32 s8, s11, s8
|
||||
; GFX908-NEXT: s_cselect_b32 s0, s10, s0
|
||||
; GFX908-NEXT: s_add_i32 s10, s8, 1
|
||||
; GFX908-NEXT: s_cmp_ge_u32 s0, s1
|
||||
; GFX908-NEXT: s_cselect_b32 s8, s10, s8
|
||||
; GFX908-NEXT: s_lshr_b32 s5, s5, 16
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s5
|
||||
; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5
|
||||
; GFX908-NEXT: s_lshl_b64 s[14:15], s[8:9], 5
|
||||
; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX908-NEXT: s_mul_i32 s1, s1, s2
|
||||
; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1
|
||||
; GFX908-NEXT: s_add_i32 s2, s2, s1
|
||||
; GFX908-NEXT: s_mul_hi_u32 s1, s6, s2
|
||||
; GFX908-NEXT: s_mul_i32 s2, s1, s7
|
||||
; GFX908-NEXT: s_sub_i32 s2, s6, s2
|
||||
; GFX908-NEXT: s_add_i32 s3, s1, 1
|
||||
; GFX908-NEXT: s_sub_i32 s6, s2, s7
|
||||
; GFX908-NEXT: s_cmp_ge_u32 s2, s7
|
||||
; GFX908-NEXT: s_cselect_b32 s1, s3, s1
|
||||
; GFX908-NEXT: s_cselect_b32 s2, s6, s2
|
||||
; GFX908-NEXT: s_add_i32 s3, s1, 1
|
||||
; GFX908-NEXT: s_cmp_ge_u32 s2, s7
|
||||
; GFX908-NEXT: s_cselect_b32 s8, s3, s1
|
||||
; GFX908-NEXT: s_lshr_b32 s2, s0, 16
|
||||
; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2
|
||||
; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
|
||||
; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1]
|
||||
; GFX908-NEXT: s_or_b32 s12, s12, 28
|
||||
; GFX908-NEXT: s_or_b32 s14, s14, 28
|
||||
; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s5, v16
|
||||
; GFX908-NEXT: s_and_b32 s5, 0xffff, s5
|
||||
; GFX908-NEXT: s_mul_i32 s3, s3, s5
|
||||
; GFX908-NEXT: s_mul_hi_u32 s9, s2, s5
|
||||
; GFX908-NEXT: s_mul_i32 s2, s2, s5
|
||||
; GFX908-NEXT: s_add_i32 s3, s9, s3
|
||||
; GFX908-NEXT: s_lshl_b64 s[16:17], s[2:3], 5
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s2, v16
|
||||
; GFX908-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GFX908-NEXT: s_mul_i32 s3, s5, s2
|
||||
; GFX908-NEXT: s_mul_hi_u32 s5, s4, s2
|
||||
; GFX908-NEXT: s_mul_i32 s2, s4, s2
|
||||
; GFX908-NEXT: s_add_i32 s3, s5, s3
|
||||
; GFX908-NEXT: s_lshl_b64 s[4:5], s[2:3], 5
|
||||
; GFX908-NEXT: s_branch .LBB3_2
|
||||
; GFX908-NEXT: .LBB3_1: ; %Flow21
|
||||
; GFX908-NEXT: .LBB3_1: ; %Flow20
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
|
||||
@ -571,47 +569,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX908-NEXT: ; %bb.3: ; %bb14
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
|
||||
; GFX908-NEXT: s_mov_b32 s5, s4
|
||||
; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
|
||||
; GFX908-NEXT: s_mov_b32 s13, s12
|
||||
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
|
||||
; GFX908-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX908-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6
|
||||
; GFX908-NEXT: v_mov_b32_e32 v7, s5
|
||||
; GFX908-NEXT: v_mov_b32_e32 v9, s5
|
||||
; GFX908-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GFX908-NEXT: v_mov_b32_e32 v6, s4
|
||||
; GFX908-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[6:7], 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v6, s12
|
||||
; GFX908-NEXT: v_mov_b32_e32 v8, s12
|
||||
; GFX908-NEXT: v_mov_b32_e32 v5, s13
|
||||
; GFX908-NEXT: v_mov_b32_e32 v7, s13
|
||||
; GFX908-NEXT: v_mov_b32_e32 v9, s13
|
||||
; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v11, v5
|
||||
; GFX908-NEXT: s_mov_b64 s[20:21], s[12:13]
|
||||
; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
|
||||
; GFX908-NEXT: v_mov_b32_e32 v10, v4
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s5, v2
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s9, v3
|
||||
; GFX908-NEXT: s_add_u32 s5, s5, 1
|
||||
; GFX908-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX908-NEXT: s_mul_hi_u32 s22, s10, s5
|
||||
; GFX908-NEXT: s_mul_i32 s9, s10, s9
|
||||
; GFX908-NEXT: s_mul_i32 s23, s11, s5
|
||||
; GFX908-NEXT: s_add_i32 s9, s22, s9
|
||||
; GFX908-NEXT: s_mul_i32 s5, s10, s5
|
||||
; GFX908-NEXT: s_add_i32 s9, s9, s23
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s9, v2
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s13, v3
|
||||
; GFX908-NEXT: s_add_u32 s9, s9, 1
|
||||
; GFX908-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX908-NEXT: s_mul_hi_u32 s22, s6, s9
|
||||
; GFX908-NEXT: s_mul_i32 s13, s6, s13
|
||||
; GFX908-NEXT: s_mul_i32 s23, s7, s9
|
||||
; GFX908-NEXT: s_add_i32 s13, s22, s13
|
||||
; GFX908-NEXT: s_mul_i32 s9, s6, s9
|
||||
; GFX908-NEXT: s_add_i32 s13, s13, s23
|
||||
; GFX908-NEXT: s_branch .LBB3_5
|
||||
; GFX908-NEXT: .LBB3_4: ; %bb58
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
|
||||
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX908-NEXT: s_add_u32 s20, s20, s16
|
||||
; GFX908-NEXT: s_add_u32 s20, s20, s4
|
||||
; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
|
||||
; GFX908-NEXT: s_addc_u32 s21, s21, s17
|
||||
; GFX908-NEXT: s_addc_u32 s21, s21, s5
|
||||
; GFX908-NEXT: s_mov_b64 s[22:23], 0
|
||||
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
|
||||
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
|
||||
; GFX908-NEXT: .LBB3_5: ; %bb16
|
||||
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
|
||||
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GFX908-NEXT: s_add_u32 s22, s20, s5
|
||||
; GFX908-NEXT: s_addc_u32 s23, s21, s9
|
||||
; GFX908-NEXT: s_add_u32 s22, s20, s9
|
||||
; GFX908-NEXT: s_addc_u32 s23, s21, s13
|
||||
; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc
|
||||
@ -659,17 +657,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1
|
||||
; GFX908-NEXT: .LBB3_10: ; %Flow20
|
||||
; GFX908-NEXT: .LBB3_10: ; %Flow19
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], -1
|
||||
; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19]
|
||||
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
|
||||
; GFX908-NEXT: ; %bb.11: ; %bb12
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX908-NEXT: s_add_u32 s6, s6, s8
|
||||
; GFX908-NEXT: s_addc_u32 s7, s7, 0
|
||||
; GFX908-NEXT: s_add_u32 s12, s12, s14
|
||||
; GFX908-NEXT: s_addc_u32 s13, s13, s15
|
||||
; GFX908-NEXT: s_add_u32 s10, s10, s8
|
||||
; GFX908-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GFX908-NEXT: s_add_u32 s14, s14, s16
|
||||
; GFX908-NEXT: s_addc_u32 s15, s15, s17
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: s_branch .LBB3_1
|
||||
; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
|
||||
@ -678,54 +676,52 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX90A-LABEL: introduced_copy_to_sgpr:
|
||||
; GFX90A: ; %bb.0: ; %bb
|
||||
; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc
|
||||
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
|
||||
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
|
||||
; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18
|
||||
; GFX90A-NEXT: s_mov_b32 s12, 0
|
||||
; GFX90A-NEXT: s_mov_b32 s9, s12
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
|
||||
; GFX90A-NEXT: s_load_dword s5, s[8:9], 0x18
|
||||
; GFX90A-NEXT: s_mov_b32 s4, 0
|
||||
; GFX90A-NEXT: s_mov_b32 s9, s4
|
||||
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1
|
||||
; GFX90A-NEXT: s_sub_i32 s8, 0, s1
|
||||
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
|
||||
; GFX90A-NEXT: s_sub_i32 s1, 0, s7
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v19, 0
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0
|
||||
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s5
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s10, v1
|
||||
; GFX90A-NEXT: s_mul_i32 s8, s8, s10
|
||||
; GFX90A-NEXT: s_mul_hi_u32 s8, s10, s8
|
||||
; GFX90A-NEXT: s_add_i32 s10, s10, s8
|
||||
; GFX90A-NEXT: s_mul_hi_u32 s8, s0, s10
|
||||
; GFX90A-NEXT: s_mul_i32 s10, s8, s1
|
||||
; GFX90A-NEXT: s_sub_i32 s0, s0, s10
|
||||
; GFX90A-NEXT: s_add_i32 s11, s8, 1
|
||||
; GFX90A-NEXT: s_sub_i32 s10, s0, s1
|
||||
; GFX90A-NEXT: s_cmp_ge_u32 s0, s1
|
||||
; GFX90A-NEXT: s_cselect_b32 s8, s11, s8
|
||||
; GFX90A-NEXT: s_cselect_b32 s0, s10, s0
|
||||
; GFX90A-NEXT: s_add_i32 s10, s8, 1
|
||||
; GFX90A-NEXT: s_cmp_ge_u32 s0, s1
|
||||
; GFX90A-NEXT: s_cselect_b32 s8, s10, s8
|
||||
; GFX90A-NEXT: s_lshr_b32 s5, s5, 16
|
||||
; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s5
|
||||
; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], 5
|
||||
; GFX90A-NEXT: s_lshl_b64 s[14:15], s[8:9], 5
|
||||
; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
|
||||
; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
|
||||
; GFX90A-NEXT: s_mul_i32 s1, s1, s2
|
||||
; GFX90A-NEXT: s_mul_hi_u32 s1, s2, s1
|
||||
; GFX90A-NEXT: s_add_i32 s2, s2, s1
|
||||
; GFX90A-NEXT: s_mul_hi_u32 s1, s6, s2
|
||||
; GFX90A-NEXT: s_mul_i32 s2, s1, s7
|
||||
; GFX90A-NEXT: s_sub_i32 s2, s6, s2
|
||||
; GFX90A-NEXT: s_add_i32 s3, s1, 1
|
||||
; GFX90A-NEXT: s_sub_i32 s6, s2, s7
|
||||
; GFX90A-NEXT: s_cmp_ge_u32 s2, s7
|
||||
; GFX90A-NEXT: s_cselect_b32 s1, s3, s1
|
||||
; GFX90A-NEXT: s_cselect_b32 s2, s6, s2
|
||||
; GFX90A-NEXT: s_add_i32 s3, s1, 1
|
||||
; GFX90A-NEXT: s_cmp_ge_u32 s2, s7
|
||||
; GFX90A-NEXT: s_cselect_b32 s8, s3, s1
|
||||
; GFX90A-NEXT: s_lshr_b32 s2, s0, 16
|
||||
; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s2
|
||||
; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
|
||||
; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
|
||||
; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1]
|
||||
; GFX90A-NEXT: s_or_b32 s12, s12, 28
|
||||
; GFX90A-NEXT: s_or_b32 s14, s14, 28
|
||||
; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s5, v18
|
||||
; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5
|
||||
; GFX90A-NEXT: s_mul_i32 s3, s3, s5
|
||||
; GFX90A-NEXT: s_mul_hi_u32 s9, s2, s5
|
||||
; GFX90A-NEXT: s_mul_i32 s2, s2, s5
|
||||
; GFX90A-NEXT: s_add_i32 s3, s9, s3
|
||||
; GFX90A-NEXT: s_lshl_b64 s[16:17], s[2:3], 5
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s2, v18
|
||||
; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GFX90A-NEXT: s_mul_i32 s3, s5, s2
|
||||
; GFX90A-NEXT: s_mul_hi_u32 s5, s4, s2
|
||||
; GFX90A-NEXT: s_mul_i32 s2, s4, s2
|
||||
; GFX90A-NEXT: s_add_i32 s3, s5, s3
|
||||
; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], 5
|
||||
; GFX90A-NEXT: s_branch .LBB3_2
|
||||
; GFX90A-NEXT: .LBB3_1: ; %Flow21
|
||||
; GFX90A-NEXT: .LBB3_1: ; %Flow20
|
||||
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
|
||||
@ -738,34 +734,34 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX90A-NEXT: ; %bb.3: ; %bb14
|
||||
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
|
||||
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
|
||||
; GFX90A-NEXT: s_mov_b32 s5, s4
|
||||
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
|
||||
; GFX90A-NEXT: s_mov_b32 s13, s12
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[6:7], 0
|
||||
; GFX90A-NEXT: s_mov_b64 s[20:21], s[12:13]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
|
||||
; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s5, v4
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s9, v5
|
||||
; GFX90A-NEXT: s_add_u32 s5, s5, 1
|
||||
; GFX90A-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX90A-NEXT: s_mul_hi_u32 s22, s10, s5
|
||||
; GFX90A-NEXT: s_mul_i32 s9, s10, s9
|
||||
; GFX90A-NEXT: s_mul_i32 s23, s11, s5
|
||||
; GFX90A-NEXT: s_add_i32 s9, s22, s9
|
||||
; GFX90A-NEXT: s_mul_i32 s5, s10, s5
|
||||
; GFX90A-NEXT: s_add_i32 s9, s9, s23
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s9, v4
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s13, v5
|
||||
; GFX90A-NEXT: s_add_u32 s9, s9, 1
|
||||
; GFX90A-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX90A-NEXT: s_mul_hi_u32 s22, s6, s9
|
||||
; GFX90A-NEXT: s_mul_i32 s13, s6, s13
|
||||
; GFX90A-NEXT: s_mul_i32 s23, s7, s9
|
||||
; GFX90A-NEXT: s_add_i32 s13, s22, s13
|
||||
; GFX90A-NEXT: s_mul_i32 s9, s6, s9
|
||||
; GFX90A-NEXT: s_add_i32 s13, s13, s23
|
||||
; GFX90A-NEXT: s_branch .LBB3_5
|
||||
; GFX90A-NEXT: .LBB3_4: ; %bb58
|
||||
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
|
||||
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; GFX90A-NEXT: s_add_u32 s20, s20, s16
|
||||
; GFX90A-NEXT: s_addc_u32 s21, s21, s17
|
||||
; GFX90A-NEXT: s_add_u32 s20, s20, s4
|
||||
; GFX90A-NEXT: s_addc_u32 s21, s21, s5
|
||||
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
|
||||
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
|
||||
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
|
||||
@ -773,8 +769,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX90A-NEXT: .LBB3_5: ; %bb16
|
||||
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
|
||||
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GFX90A-NEXT: s_add_u32 s22, s20, s5
|
||||
; GFX90A-NEXT: s_addc_u32 s23, s21, s9
|
||||
; GFX90A-NEXT: s_add_u32 s22, s20, s9
|
||||
; GFX90A-NEXT: s_addc_u32 s23, s21, s13
|
||||
; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc
|
||||
@ -815,17 +811,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard
|
||||
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1
|
||||
; GFX90A-NEXT: .LBB3_10: ; %Flow20
|
||||
; GFX90A-NEXT: .LBB3_10: ; %Flow19
|
||||
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], -1
|
||||
; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19]
|
||||
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
|
||||
; GFX90A-NEXT: ; %bb.11: ; %bb12
|
||||
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX90A-NEXT: s_add_u32 s6, s6, s8
|
||||
; GFX90A-NEXT: s_addc_u32 s7, s7, 0
|
||||
; GFX90A-NEXT: s_add_u32 s12, s12, s14
|
||||
; GFX90A-NEXT: s_addc_u32 s13, s13, s15
|
||||
; GFX90A-NEXT: s_add_u32 s10, s10, s8
|
||||
; GFX90A-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GFX90A-NEXT: s_add_u32 s14, s14, s16
|
||||
; GFX90A-NEXT: s_addc_u32 s15, s15, s17
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_branch .LBB3_1
|
||||
; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -257,29 +257,28 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
|
||||
; GFX6-LABEL: build_v2i32_from_v4i16_shuffle:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s4, s0
|
||||
; GFX6-NEXT: s_mov_b32 s5, s1
|
||||
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
|
||||
; GFX6-NEXT: s_lshl_b32 s1, s2, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX6-NEXT: s_lshl_b32 s5, s5, 16
|
||||
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s3, 16
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
|
||||
|
||||
@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
|
||||
; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
|
||||
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_fabs_v4bf16:
|
||||
@ -234,13 +234,23 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
|
||||
; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_and_b32 s4, s3, 0x7fff
|
||||
; VI-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; VI-NEXT: s_and_b32 s5, s2, 0x7fff
|
||||
; VI-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; VI-NEXT: s_and_b32 s3, s3, 0x7fff
|
||||
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
|
||||
; VI-NEXT: s_and_b32 s4, 0xffff, s4
|
||||
; VI-NEXT: s_and_b32 s5, 0xffff, s5
|
||||
; VI-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; VI-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; VI-NEXT: s_or_b32 s3, s4, s3
|
||||
; VI-NEXT: s_or_b32 s2, s5, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: s_fabs_v4bf16:
|
||||
@ -248,8 +258,14 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
|
||||
; GFX9-NEXT: s_and_b32 s4, s3, 0x7fff
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; GFX9-NEXT: s_and_b32 s5, s2, 0x7fff
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
@ -259,8 +275,14 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
|
||||
; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff
|
||||
; GFX11-NEXT: s_and_b32 s4, s3, 0x7fff
|
||||
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; GFX11-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
|
||||
; GFX11-NEXT: s_and_b32 s5, s5, 0x7fff
|
||||
; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff
|
||||
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s5
|
||||
; GFX11-NEXT: s_pack_ll_b32_b16 s3, s4, s3
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
||||
@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
|
||||
; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
|
||||
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_fabs_v4f16:
|
||||
@ -234,13 +234,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
|
||||
; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
|
||||
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: s_fabs_v4f16:
|
||||
|
||||
@ -99,29 +99,28 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
|
||||
; SI-LABEL: fabs_v2f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_and_b32 s0, s3, 0x7fffffff
|
||||
; SI-NEXT: s_and_b32 s1, s2, 0x7fffffff
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_bitset0_b32 s5, 31
|
||||
; SI-NEXT: s_bitset0_b32 s4, 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fabs_v2f32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
|
||||
; VI-NEXT: s_and_b32 s1, s2, 0x7fffffff
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_bitset0_b32 s3, 31
|
||||
; VI-NEXT: s_bitset0_b32 s2, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
|
||||
store <2 x float> %fabs, ptr addrspace(1) %out
|
||||
|
||||
@ -472,52 +472,50 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
|
||||
define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) {
|
||||
; SI-LABEL: s_test_copysign_v2f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; SI-NEXT: s_brev_b32 s8, -2
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_brev_b32 s0, -2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SI-NEXT: v_bfi_b32 v1, s0, v0, v1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s8
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_bfi_b32 v0, s0, v0, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: v_bfi_b32 v1, s8, v0, v1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SI-NEXT: v_bfi_b32 v0, s8, v0, v2
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_test_copysign_v2f32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_brev_b32 s6, -2
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: v_bfi_b32 v1, s6, v0, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_bfi_b32 v0, s6, v2, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: v_bfi_b32 v3, s6, v2, v3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_bfi_b32 v2, s6, v2, v4
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: s_test_copysign_v2f32:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
|
||||
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v2
|
||||
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
|
||||
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v2
|
||||
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
%result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
|
||||
store <2 x float> %result, ptr addrspace(1) %out, align 8
|
||||
|
||||
@ -932,18 +932,16 @@ entry:
|
||||
define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
|
||||
; GFX6-FASTFMA-LABEL: s_fdiv_v2f32:
|
||||
; GFX6-FASTFMA: ; %bb.0: ; %entry
|
||||
; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
|
||||
; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-FASTFMA-NEXT: s_mov_b32 s4, s0
|
||||
; GFX6-FASTFMA-NEXT: s_mov_b32 s5, s1
|
||||
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1
|
||||
; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1
|
||||
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
|
||||
; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3
|
||||
; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9
|
||||
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
||||
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0
|
||||
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3
|
||||
@ -952,13 +950,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4
|
||||
; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0
|
||||
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4
|
||||
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2
|
||||
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2
|
||||
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3
|
||||
; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s9, v1
|
||||
; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2
|
||||
; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s11, v1
|
||||
; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8
|
||||
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
||||
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0
|
||||
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4
|
||||
@ -968,21 +966,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v3, v5, v0
|
||||
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5
|
||||
; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s8, v2
|
||||
; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s10, v2
|
||||
; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX6-FASTFMA-NEXT: s_endpgm
|
||||
;
|
||||
; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32:
|
||||
; GFX6-SLOWFMA: ; %bb.0: ; %entry
|
||||
; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
|
||||
; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0
|
||||
; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3
|
||||
; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0
|
||||
; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1
|
||||
; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1
|
||||
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
||||
; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0
|
||||
@ -992,13 +989,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5
|
||||
; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2
|
||||
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4
|
||||
; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4
|
||||
; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5
|
||||
; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2
|
||||
; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0
|
||||
; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2
|
||||
; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s5, v0
|
||||
; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s3, v0
|
||||
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
||||
; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v5, 1.0
|
||||
; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v5, v5
|
||||
@ -1008,24 +1006,22 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3
|
||||
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5
|
||||
; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v4
|
||||
; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s2, v4
|
||||
; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-SLOWFMA-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: s_fdiv_v2f32:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_mov_b32 s4, s0
|
||||
; GFX7-NEXT: s_mov_b32 s5, s1
|
||||
; GFX7-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1
|
||||
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9
|
||||
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
||||
; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0
|
||||
; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3
|
||||
@ -1034,13 +1030,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4
|
||||
; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0
|
||||
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v3, v4
|
||||
; GFX7-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2
|
||||
; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2
|
||||
; GFX7-NEXT: v_rcp_f32_e32 v4, v3
|
||||
; GFX7-NEXT: v_div_fixup_f32 v1, v0, s9, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2
|
||||
; GFX7-NEXT: v_div_fixup_f32 v1, v0, s11, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8
|
||||
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
||||
; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0
|
||||
; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4
|
||||
@ -1050,20 +1046,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX7-NEXT: v_fma_f32 v0, -v3, v5, v0
|
||||
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5
|
||||
; GFX7-NEXT: v_div_fixup_f32 v0, v0, s8, v2
|
||||
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX7-NEXT: v_div_fixup_f32 v0, v0, s10, v2
|
||||
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: s_fdiv_v2f32:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v3, v1
|
||||
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
||||
; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0
|
||||
@ -1073,12 +1068,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5
|
||||
; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2
|
||||
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4
|
||||
; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX8-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX8-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v5, v2
|
||||
; GFX8-NEXT: v_div_fixup_f32 v1, v1, s5, v0
|
||||
; GFX8-NEXT: v_div_fixup_f32 v1, v1, s3, v0
|
||||
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
|
||||
; GFX8-NEXT: v_fma_f32 v0, -v2, v5, 1.0
|
||||
; GFX8-NEXT: v_fma_f32 v0, v0, v5, v5
|
||||
@ -1088,20 +1084,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
|
||||
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: v_div_fixup_f32 v0, v0, s4, v4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: v_div_fixup_f32 v0, v0, s2, v4
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: s_fdiv_v2f32:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s4, s7, s7, s3
|
||||
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s7, s3
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s6, s3, s3, s1
|
||||
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
|
||||
; GFX10-NEXT: s_denorm_mode 15
|
||||
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
|
||||
@ -1111,11 +1106,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2
|
||||
; GFX10-NEXT: s_denorm_mode 12
|
||||
; GFX10-NEXT: v_div_scale_f32 v2, s4, s6, s6, s2
|
||||
; GFX10-NEXT: v_div_scale_f32 v2, s6, s2, s2, s0
|
||||
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
|
||||
; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s3
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s6, s2
|
||||
; GFX10-NEXT: v_div_fixup_f32 v1, v0, s3, s1
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0
|
||||
; GFX10-NEXT: s_denorm_mode 15
|
||||
; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0
|
||||
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3
|
||||
@ -1126,18 +1122,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX10-NEXT: s_denorm_mode 12
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4
|
||||
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, s0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: s_fdiv_v2f32:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s3
|
||||
; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s5, s3
|
||||
; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s1
|
||||
; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
|
||||
; GFX11-NEXT: s_denorm_mode 15
|
||||
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
||||
@ -1148,11 +1145,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1
|
||||
; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2
|
||||
; GFX11-NEXT: s_denorm_mode 12
|
||||
; GFX11-NEXT: v_div_scale_f32 v2, null, s4, s4, s2
|
||||
; GFX11-NEXT: v_div_scale_f32 v2, null, s2, s2, s0
|
||||
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
|
||||
; GFX11-NEXT: v_div_fixup_f32 v1, v0, s5, s3
|
||||
; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s4, s2
|
||||
; GFX11-NEXT: v_div_fixup_f32 v1, v0, s3, s1
|
||||
; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0
|
||||
; GFX11-NEXT: s_denorm_mode 15
|
||||
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
||||
; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0
|
||||
@ -1164,8 +1161,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
|
||||
; GFX11-NEXT: s_denorm_mode 12
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4
|
||||
; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, s0
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: s_fdiv_v2f32:
|
||||
@ -1190,60 +1187,58 @@ entry:
|
||||
define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
|
||||
; GFX67-LABEL: s_fdiv_ulp25_v2f32:
|
||||
; GFX67: ; %bb.0: ; %entry
|
||||
; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; GFX67-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX67-NEXT: s_mov_b32 s6, -1
|
||||
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v0, s8
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v1, s9
|
||||
; GFX67-NEXT: s_mov_b32 s4, s0
|
||||
; GFX67-NEXT: s_mov_b32 s5, s1
|
||||
; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0
|
||||
; GFX67-NEXT: v_mul_f32_e32 v1, s3, v1
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v0, s2
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v1, s3
|
||||
; GFX67-NEXT: v_mul_f32_e32 v0, s0, v0
|
||||
; GFX67-NEXT: v_mul_f32_e32 v1, s1, v1
|
||||
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX67-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: s_fdiv_ulp25_v2f32:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v2, s6
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_mul_f32_e32 v2, s2, v2
|
||||
; GFX8-NEXT: v_mul_f32_e32 v3, s3, v3
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v0, s2
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0
|
||||
; GFX8-NEXT: v_mul_f32_e32 v1, s1, v1
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: s_fdiv_ulp25_v2f32:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v0, s6
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v1, s7
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, s3, v1
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v0, s2
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v1, s3
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, s1, v1
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: s_fdiv_ulp25_v2f32:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v0, s6
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v1, s7
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v0, s2
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v1, s3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
||||
; GFX11-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: s_fdiv_ulp25_v2f32:
|
||||
@ -1268,60 +1263,58 @@ entry:
|
||||
define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
|
||||
; GFX67-LABEL: s_fdiv_v2f32_fast_math:
|
||||
; GFX67: ; %bb.0: ; %entry
|
||||
; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; GFX67-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX67-NEXT: s_mov_b32 s6, -1
|
||||
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v0, s9
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v2, s8
|
||||
; GFX67-NEXT: s_mov_b32 s4, s0
|
||||
; GFX67-NEXT: s_mov_b32 s5, s1
|
||||
; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0
|
||||
; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v0, s3
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v2, s2
|
||||
; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0
|
||||
; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2
|
||||
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX67-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: s_fdiv_v2f32_fast_math:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v2, s7
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v4, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2
|
||||
; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v0, s3
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v2, s2
|
||||
; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: s_fdiv_v2f32_fast_math:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v2, s6
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2
|
||||
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v2, s2
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2
|
||||
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: s_fdiv_v2f32_fast_math:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v0, s7
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v2, s6
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v0, s3
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v2, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
||||
; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
|
||||
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
|
||||
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: s_fdiv_v2f32_fast_math:
|
||||
@ -1346,60 +1339,58 @@ entry:
|
||||
define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
|
||||
; GFX67-LABEL: s_fdiv_v2f32_arcp_math:
|
||||
; GFX67: ; %bb.0: ; %entry
|
||||
; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; GFX67-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX67-NEXT: s_mov_b32 s6, -1
|
||||
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v0, s9
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v2, s8
|
||||
; GFX67-NEXT: s_mov_b32 s4, s0
|
||||
; GFX67-NEXT: s_mov_b32 s5, s1
|
||||
; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0
|
||||
; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v0, s3
|
||||
; GFX67-NEXT: v_rcp_f32_e32 v2, s2
|
||||
; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0
|
||||
; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2
|
||||
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX67-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: s_fdiv_v2f32_arcp_math:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v2, s7
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v4, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2
|
||||
; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v0, s3
|
||||
; GFX8-NEXT: v_rcp_f32_e32 v2, s2
|
||||
; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: s_fdiv_v2f32_arcp_math:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v2, s6
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2
|
||||
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v2, s2
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2
|
||||
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: s_fdiv_v2f32_arcp_math:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v0, s7
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v2, s6
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v0, s3
|
||||
; GFX11-NEXT: v_rcp_f32_e32 v2, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
||||
; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
|
||||
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
|
||||
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: s_fdiv_v2f32_arcp_math:
|
||||
|
||||
@ -121,25 +121,24 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
|
||||
; SICI-LABEL: fnearbyint_v2f32:
|
||||
; SICI: ; %bb.0: ; %entry
|
||||
; SICI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SICI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SICI-NEXT: s_mov_b32 s6, -1
|
||||
; SICI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SICI-NEXT: s_mov_b32 s4, s0
|
||||
; SICI-NEXT: s_mov_b32 s5, s1
|
||||
; SICI-NEXT: v_rndne_f32_e32 v1, s3
|
||||
; SICI-NEXT: v_rndne_f32_e32 v0, s2
|
||||
; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SICI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SICI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SICI-NEXT: s_mov_b32 s2, -1
|
||||
; SICI-NEXT: v_rndne_f32_e32 v1, s5
|
||||
; SICI-NEXT: v_rndne_f32_e32 v0, s4
|
||||
; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SICI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fnearbyint_v2f32:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_rndne_f32_e32 v3, s3
|
||||
; VI-NEXT: v_rndne_f32_e32 v2, s2
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_rndne_f32_e32 v1, s3
|
||||
; VI-NEXT: v_rndne_f32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: fnearbyint_v2f32:
|
||||
|
||||
@ -624,13 +624,13 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_or_b32 s0, s3, 0x80008000
|
||||
; CI-NEXT: s_or_b32 s1, s2, 0x80008000
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; CI-NEXT: s_or_b32 s3, s3, 0x80008000
|
||||
; CI-NEXT: s_or_b32 s2, s2, 0x80008000
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fneg_fabs_v4bf16:
|
||||
@ -640,25 +640,23 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_and_b32 s0, s2, 0x7fff7fff
|
||||
; VI-NEXT: s_and_b32 s1, s3, 0x7fff7fff
|
||||
; VI-NEXT: s_bfe_u32 s3, s3, 0xf0010
|
||||
; VI-NEXT: s_bfe_u32 s2, s2, 0xf0010
|
||||
; VI-NEXT: s_xor_b32 s1, s1, 0x8000
|
||||
; VI-NEXT: s_xor_b32 s3, s3, 0x8000
|
||||
; VI-NEXT: s_xor_b32 s0, s0, 0x8000
|
||||
; VI-NEXT: s_xor_b32 s2, s2, 0x8000
|
||||
; VI-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; VI-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; VI-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; VI-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; VI-NEXT: s_or_b32 s1, s1, s3
|
||||
; VI-NEXT: s_or_b32 s0, s0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; VI-NEXT: s_lshr_b32 s5, s3, 16
|
||||
; VI-NEXT: s_bitset1_b32 s3, 15
|
||||
; VI-NEXT: s_bitset1_b32 s2, 15
|
||||
; VI-NEXT: s_bitset1_b32 s5, 15
|
||||
; VI-NEXT: s_bitset1_b32 s4, 15
|
||||
; VI-NEXT: s_and_b32 s3, 0xffff, s3
|
||||
; VI-NEXT: s_lshl_b32 s5, s5, 16
|
||||
; VI-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; VI-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; VI-NEXT: s_or_b32 s3, s3, s5
|
||||
; VI-NEXT: s_or_b32 s2, s2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fneg_fabs_v4bf16:
|
||||
@ -666,16 +664,14 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff7fff
|
||||
; GFX9-NEXT: s_and_b32 s5, s3, 0x7fff7fff
|
||||
; GFX9-NEXT: s_bfe_u32 s3, s3, 0xf0010
|
||||
; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf0010
|
||||
; GFX9-NEXT: s_xor_b32 s3, s3, 0x8000
|
||||
; GFX9-NEXT: s_xor_b32 s5, s5, 0x8000
|
||||
; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
|
||||
; GFX9-NEXT: s_xor_b32 s4, s4, 0x8000
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s5, s3
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s3, 16
|
||||
; GFX9-NEXT: s_bitset1_b32 s3, 15
|
||||
; GFX9-NEXT: s_bitset1_b32 s2, 15
|
||||
; GFX9-NEXT: s_bitset1_b32 s5, 15
|
||||
; GFX9-NEXT: s_bitset1_b32 s4, 15
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
@ -685,16 +681,14 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_and_b32 s4, s2, 0x7fff7fff
|
||||
; GFX11-NEXT: s_and_b32 s5, s3, 0x7fff7fff
|
||||
; GFX11-NEXT: s_bfe_u32 s3, s3, 0xf0010
|
||||
; GFX11-NEXT: s_bfe_u32 s2, s2, 0xf0010
|
||||
; GFX11-NEXT: s_xor_b32 s3, s3, 0x8000
|
||||
; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
|
||||
; GFX11-NEXT: s_xor_b32 s4, s4, 0x8000
|
||||
; GFX11-NEXT: s_xor_b32 s5, s5, 0x8000
|
||||
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s2
|
||||
; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s3
|
||||
; GFX11-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; GFX11-NEXT: s_lshr_b32 s5, s3, 16
|
||||
; GFX11-NEXT: s_bitset1_b32 s3, 15
|
||||
; GFX11-NEXT: s_bitset1_b32 s2, 15
|
||||
; GFX11-NEXT: s_bitset1_b32 s4, 15
|
||||
; GFX11-NEXT: s_bitset1_b32 s5, 15
|
||||
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
|
||||
; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
||||
@ -516,13 +516,13 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
|
||||
; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CIVI-NEXT: s_or_b32 s0, s3, 0x80008000
|
||||
; CIVI-NEXT: s_or_b32 s1, s2, 0x80008000
|
||||
; CIVI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; CIVI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
|
||||
; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
|
||||
; CIVI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CIVI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; CIVI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CIVI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; CIVI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fneg_fabs_v4f16:
|
||||
|
||||
@ -199,29 +199,28 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
|
||||
; SI-LABEL: fneg_fabsf_v2f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_bitset1_b32 s3, 31
|
||||
; SI-NEXT: s_bitset1_b32 s2, 31
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_bitset1_b32 s5, 31
|
||||
; SI-NEXT: s_bitset1_b32 s4, 31
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fneg_fabsf_v2f32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_or_b32 s0, s3, 0x80000000
|
||||
; VI-NEXT: s_or_b32 s1, s2, 0x80000000
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_bitset1_b32 s3, 31
|
||||
; VI-NEXT: s_bitset1_b32 s2, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
|
||||
%fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
|
||||
|
||||
@ -52,29 +52,28 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
|
||||
; SI-LABEL: s_fneg_v2f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
|
||||
; SI-NEXT: s_xor_b32 s1, s2, 0x80000000
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_xor_b32 s5, s5, 0x80000000
|
||||
; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_fneg_v2f32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
|
||||
; VI-NEXT: s_xor_b32 s1, s2, 0x80000000
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
|
||||
; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: s_fneg_v2f32:
|
||||
|
||||
@ -134,27 +134,24 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
|
||||
; SI-LABEL: fp_to_sint_v2i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v1, s3
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v0, s2
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v1, s5
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fp_to_sint_v2i32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_i32_f32_e32 v1, s3
|
||||
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-SDAG-LABEL: fp_to_sint_v2i32:
|
||||
@ -438,26 +435,25 @@ entry:
|
||||
define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) {
|
||||
; SI-LABEL: fp_to_sint_v2i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s6, 0x2f800000
|
||||
; SI-NEXT: s_mov_b32 s7, 0xcf800000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_mov_b32 s8, 0x2f800000
|
||||
; SI-NEXT: s_mov_b32 s9, 0xcf800000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: v_trunc_f32_e32 v0, s7
|
||||
; SI-NEXT: v_trunc_f32_e32 v1, s6
|
||||
; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8
|
||||
; SI-NEXT: v_trunc_f32_e32 v0, s5
|
||||
; SI-NEXT: v_trunc_f32_e32 v1, s4
|
||||
; SI-NEXT: v_mul_f32_e64 v2, |v0|, s6
|
||||
; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
|
||||
; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8
|
||||
; SI-NEXT: v_mul_f32_e64 v4, |v1|, s6
|
||||
; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1
|
||||
; SI-NEXT: v_floor_f32_e32 v2, v2
|
||||
; SI-NEXT: v_floor_f32_e32 v4, v4
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v6, v2
|
||||
; SI-NEXT: v_fma_f32 v0, v2, s9, |v0|
|
||||
; SI-NEXT: v_fma_f32 v0, v2, s7, |v0|
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v2, v4
|
||||
; SI-NEXT: v_fma_f32 v1, v4, s9, |v1|
|
||||
; SI-NEXT: v_fma_f32 v1, v4, s7, |v1|
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; SI-NEXT: v_xor_b32_e32 v4, v6, v3
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
@ -474,36 +470,35 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
|
||||
; VI-LABEL: fp_to_sint_v2i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_mov_b32 s8, 0x2f800000
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_mov_b32 s6, 0x2f800000
|
||||
; VI-NEXT: s_mov_b32 s7, 0xcf800000
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_trunc_f32_e32 v0, s3
|
||||
; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; VI-NEXT: v_trunc_f32_e32 v0, s5
|
||||
; VI-NEXT: v_mul_f32_e64 v1, |v0|, s6
|
||||
; VI-NEXT: v_floor_f32_e32 v1, v1
|
||||
; VI-NEXT: s_mov_b32 s0, 0xcf800000
|
||||
; VI-NEXT: v_fma_f32 v2, v1, s0, |v0|
|
||||
; VI-NEXT: v_trunc_f32_e32 v4, s2
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
|
||||
; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; VI-NEXT: v_floor_f32_e32 v3, v3
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v5, v3
|
||||
; VI-NEXT: v_fma_f32 v3, v3, s0, |v4|
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v2, v1
|
||||
; VI-NEXT: v_fma_f32 v1, v1, s7, |v0|
|
||||
; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v6, v3
|
||||
; VI-NEXT: v_xor_b32_e32 v2, v2, v0
|
||||
; VI-NEXT: v_trunc_f32_e32 v4, s4
|
||||
; VI-NEXT: v_xor_b32_e32 v3, v2, v0
|
||||
; VI-NEXT: v_mul_f32_e64 v2, |v4|, s6
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; VI-NEXT: v_floor_f32_e32 v2, v2
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v5, v2
|
||||
; VI-NEXT: v_fma_f32 v2, v2, s7, |v4|
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v6, v2
|
||||
; VI-NEXT: v_xor_b32_e32 v1, v1, v0
|
||||
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc
|
||||
; VI-NEXT: v_sub_u32_e32 v2, vcc, v1, v0
|
||||
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4
|
||||
; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
|
||||
; VI-NEXT: v_xor_b32_e32 v0, v6, v1
|
||||
; VI-NEXT: v_xor_b32_e32 v4, v5, v1
|
||||
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-SDAG-LABEL: fp_to_sint_v2i64:
|
||||
@ -1298,32 +1293,29 @@ define amdgpu_kernel void @fp_to_sint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x
|
||||
; SI-LABEL: fp_to_sint_v2f32_to_v2i16:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v0, s3
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v1, s2
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v0, s5
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v1, s4
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; SI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fp_to_sint_v2f32_to_v2i16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_cvt_i32_f32_e32 v1, s2
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-SDAG-LABEL: fp_to_sint_v2f32_to_v2i16:
|
||||
|
||||
@ -72,27 +72,24 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x
|
||||
; SI-LABEL: fp_to_uint_v2f32_to_v2i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v1, s3
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v0, s2
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v1, s5
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fp_to_uint_v2f32_to_v2i32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v1, s3
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i32:
|
||||
@ -349,32 +346,29 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
|
||||
; SI-LABEL: fp_to_uint_v2f32_to_v2i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s8, 0xcf800000
|
||||
; SI-NEXT: s_mov_b32 s6, 0xcf800000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_trunc_f32_e32 v0, s3
|
||||
; SI-NEXT: v_trunc_f32_e32 v2, s2
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_trunc_f32_e32 v0, s5
|
||||
; SI-NEXT: v_trunc_f32_e32 v2, s4
|
||||
; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
|
||||
; SI-NEXT: v_floor_f32_e32 v4, v1
|
||||
; SI-NEXT: v_floor_f32_e32 v5, v3
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v3, v4
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v1, v5
|
||||
; SI-NEXT: v_fma_f32 v0, v4, s8, v0
|
||||
; SI-NEXT: v_fma_f32 v4, v5, s8, v2
|
||||
; SI-NEXT: v_fma_f32 v0, v4, s6, v0
|
||||
; SI-NEXT: v_fma_f32 v4, v5, s6, v2
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v2, v0
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v0, v4
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fp_to_uint_v2f32_to_v2i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_trunc_f32_e32 v0, s3
|
||||
; VI-NEXT: v_trunc_f32_e32 v4, s2
|
||||
@ -389,9 +383,9 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i64:
|
||||
@ -1078,31 +1072,28 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x
|
||||
; SI-LABEL: fp_to_uint_v2f32_to_v2i16:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v0, s3
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v0, s5
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v1, s2
|
||||
; SI-NEXT: v_cvt_u32_f32_e32 v1, s4
|
||||
; SI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fp_to_uint_v2f32_to_v2i16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-NEXT: v_cvt_u32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_cvt_u32_f32_e32 v1, s2
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i16:
|
||||
|
||||
@ -209,81 +209,85 @@ entry:
|
||||
define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
|
||||
; SI-LABEL: fshl_v2i32:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_mov_b32 s0, s5
|
||||
; SI-NEXT: s_mov_b32 s1, s3
|
||||
; SI-NEXT: s_lshr_b32 s12, s3, 1
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
|
||||
; SI-NEXT: s_not_b32 s3, s7
|
||||
; SI-NEXT: s_mov_b32 s1, s12
|
||||
; SI-NEXT: s_and_b32 s3, s3, 31
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
|
||||
; SI-NEXT: s_mov_b32 s5, s2
|
||||
; SI-NEXT: s_lshr_b32 s1, s2, 1
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], 1
|
||||
; SI-NEXT: s_mov_b32 s3, s1
|
||||
; SI-NEXT: s_not_b32 s1, s6
|
||||
; SI-NEXT: s_mov_b32 s6, s3
|
||||
; SI-NEXT: s_mov_b32 s7, s1
|
||||
; SI-NEXT: s_lshr_b32 s12, s1, 1
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
|
||||
; SI-NEXT: s_not_b32 s1, s5
|
||||
; SI-NEXT: s_mov_b32 s7, s12
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: s_mov_b32 s3, s0
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1
|
||||
; SI-NEXT: s_lshr_b32 s5, s0, 1
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
|
||||
; SI-NEXT: s_not_b32 s2, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fshl_v2i32:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_mov_b32 s0, s5
|
||||
; VI-NEXT: s_mov_b32 s1, s3
|
||||
; VI-NEXT: s_lshr_b32 s8, s3, 1
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
|
||||
; VI-NEXT: s_not_b32 s3, s7
|
||||
; VI-NEXT: s_mov_b32 s1, s8
|
||||
; VI-NEXT: s_and_b32 s3, s3, 31
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
|
||||
; VI-NEXT: s_mov_b32 s5, s2
|
||||
; VI-NEXT: s_lshr_b32 s1, s2, 1
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 1
|
||||
; VI-NEXT: s_mov_b32 s3, s1
|
||||
; VI-NEXT: s_not_b32 s1, s6
|
||||
; VI-NEXT: s_mov_b32 s8, s3
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_lshr_b32 s10, s1, 1
|
||||
; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 1
|
||||
; VI-NEXT: s_not_b32 s1, s5
|
||||
; VI-NEXT: s_mov_b32 s9, s10
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_mov_b32 s3, s0
|
||||
; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1
|
||||
; VI-NEXT: s_lshr_b32 s5, s0, 1
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
|
||||
; VI-NEXT: s_not_b32 s2, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fshl_v2i32:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s0, s13
|
||||
; GFX9-NEXT: s_mov_b32 s1, s11
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s11, 1
|
||||
; GFX9-NEXT: s_not_b32 s3, s15
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
|
||||
; GFX9-NEXT: s_mov_b32 s1, s2
|
||||
; GFX9-NEXT: s_and_b32 s2, s3, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX9-NEXT: s_mov_b32 s13, s10
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s10, 1
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[12:13], 1
|
||||
; GFX9-NEXT: s_mov_b32 s3, s1
|
||||
; GFX9-NEXT: s_not_b32 s1, s14
|
||||
; GFX9-NEXT: s_mov_b32 s4, s3
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s10, s1, 1
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
|
||||
; GFX9-NEXT: s_not_b32 s1, s9
|
||||
; GFX9-NEXT: s_mov_b32 s5, s10
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1
|
||||
; GFX9-NEXT: s_mov_b32 s3, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s0, 1
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
|
||||
; GFX9-NEXT: s_not_b32 s2, s8
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; R600-LABEL: fshl_v2i32:
|
||||
@ -306,24 +310,27 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
;
|
||||
; GFX10-LABEL: fshl_v2i32:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_clause 0x2
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_mov_b32 s0, s13
|
||||
; GFX10-NEXT: s_mov_b32 s1, s11
|
||||
; GFX10-NEXT: s_not_b32 s2, s15
|
||||
; GFX10-NEXT: s_mov_b32 s13, s10
|
||||
; GFX10-NEXT: s_lshr_b32 s4, s11, 1
|
||||
; GFX10-NEXT: s_lshr_b32 s5, s10, 1
|
||||
; GFX10-NEXT: s_not_b32 s6, s14
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
|
||||
; GFX10-NEXT: s_and_b32 s7, s2, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[12:13], 1
|
||||
; GFX10-NEXT: s_and_b32 s6, s6, 31
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: s_mov_b32 s1, s4
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s7
|
||||
; GFX10-NEXT: s_mov_b32 s4, s3
|
||||
; GFX10-NEXT: s_mov_b32 s5, s1
|
||||
; GFX10-NEXT: s_mov_b32 s3, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s10, s1, 1
|
||||
; GFX10-NEXT: s_not_b32 s7, s7
|
||||
; GFX10-NEXT: s_lshr_b32 s11, s0, 1
|
||||
; GFX10-NEXT: s_not_b32 s6, s6
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
|
||||
; GFX10-NEXT: s_and_b32 s4, s7, 31
|
||||
; GFX10-NEXT: s_and_b32 s5, s6, 31
|
||||
; GFX10-NEXT: s_mov_b32 s3, s11
|
||||
; GFX10-NEXT: s_mov_b32 s1, s10
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
@ -331,27 +338,30 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
;
|
||||
; GFX11-LABEL: fshl_v2i32:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_clause 0x2
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_mov_b32 s8, s5
|
||||
; GFX11-NEXT: s_mov_b32 s9, s3
|
||||
; GFX11-NEXT: s_mov_b32 s5, s2
|
||||
; GFX11-NEXT: s_lshr_b32 s10, s3, 1
|
||||
; GFX11-NEXT: s_mov_b32 s8, s3
|
||||
; GFX11-NEXT: s_mov_b32 s9, s1
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s10, s1, 1
|
||||
; GFX11-NEXT: s_not_b32 s7, s7
|
||||
; GFX11-NEXT: s_lshr_b32 s11, s2, 1
|
||||
; GFX11-NEXT: s_lshr_b32 s11, s0, 1
|
||||
; GFX11-NEXT: s_not_b32 s6, s6
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], 1
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
|
||||
; GFX11-NEXT: s_and_b32 s7, s7, 31
|
||||
; GFX11-NEXT: s_and_b32 s6, s6, 31
|
||||
; GFX11-NEXT: s_mov_b32 s5, s11
|
||||
; GFX11-NEXT: s_mov_b32 s3, s10
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7
|
||||
; GFX11-NEXT: s_mov_b32 s3, s11
|
||||
; GFX11-NEXT: s_mov_b32 s1, s10
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s7
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
|
||||
@ -362,54 +372,52 @@ entry:
|
||||
define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
|
||||
; SI-LABEL: fshl_v2i32_imm:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_mov_b32 s0, s9
|
||||
; SI-NEXT: s_mov_b32 s1, s3
|
||||
; SI-NEXT: s_mov_b32 s9, s2
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 23
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], 25
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: s_mov_b32 s8, s3
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_mov_b32 s3, s0
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 23
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fshl_v2i32_imm:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_mov_b32 s0, s5
|
||||
; VI-NEXT: s_mov_b32 s1, s3
|
||||
; VI-NEXT: s_mov_b32 s5, s2
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 23
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 25
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_mov_b32 s6, s3
|
||||
; VI-NEXT: s_mov_b32 s7, s1
|
||||
; VI-NEXT: s_mov_b32 s3, s0
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fshl_v2i32_imm:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s4, s7
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_mov_b32 s7, s2
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 23
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 25
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 s4, s3
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_mov_b32 s3, s0
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 23
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 25
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; R600-LABEL: fshl_v2i32_imm:
|
||||
@ -429,35 +437,35 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
|
||||
; GFX10-LABEL: fshl_v2i32_imm:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_mov_b32 s4, s7
|
||||
; GFX10-NEXT: s_mov_b32 s7, s2
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[6:7], 25
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 23
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: s_mov_b32 s4, s3
|
||||
; GFX10-NEXT: s_mov_b32 s3, s0
|
||||
; GFX10-NEXT: s_mov_b32 s5, s1
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 23
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: fshl_v2i32_imm:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_mov_b32 s4, s7
|
||||
; GFX11-NEXT: s_mov_b32 s7, s2
|
||||
; GFX11-NEXT: s_mov_b32 s5, s3
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 25
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 23
|
||||
; GFX11-NEXT: s_mov_b32 s6, s3
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
; GFX11-NEXT: s_mov_b32 s7, s1
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 23
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
|
||||
|
||||
@ -325,56 +325,60 @@ entry:
|
||||
define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
|
||||
; SI-LABEL: fshr_v2i32:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_mov_b32 s0, s5
|
||||
; SI-NEXT: s_mov_b32 s1, s3
|
||||
; SI-NEXT: s_and_b32 s3, s7, 31
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
|
||||
; SI-NEXT: s_mov_b32 s5, s2
|
||||
; SI-NEXT: s_and_b32 s1, s6, 31
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: s_mov_b32 s6, s3
|
||||
; SI-NEXT: s_mov_b32 s7, s1
|
||||
; SI-NEXT: s_and_b32 s1, s5, 31
|
||||
; SI-NEXT: s_mov_b32 s3, s0
|
||||
; SI-NEXT: s_and_b32 s0, s4, 31
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fshr_v2i32:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s8, s3
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_and_b32 s1, s7, 31
|
||||
; VI-NEXT: s_mov_b32 s3, s0
|
||||
; VI-NEXT: s_and_b32 s0, s6, 31
|
||||
; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_mov_b32 s0, s5
|
||||
; VI-NEXT: s_mov_b32 s1, s3
|
||||
; VI-NEXT: s_and_b32 s3, s7, 31
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
|
||||
; VI-NEXT: s_mov_b32 s5, s2
|
||||
; VI-NEXT: s_and_b32 s1, s6, 31
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fshr_v2i32:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s0, s13
|
||||
; GFX9-NEXT: s_mov_b32 s1, s11
|
||||
; GFX9-NEXT: s_and_b32 s2, s15, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX9-NEXT: s_mov_b32 s13, s10
|
||||
; GFX9-NEXT: s_and_b32 s1, s14, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[12:13], s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: s_mov_b32 s4, s3
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s7, 31
|
||||
; GFX9-NEXT: s_mov_b32 s3, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s6, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@ -394,53 +398,62 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
;
|
||||
; GFX10-LABEL: fshr_v2i32:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_clause 0x2
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_mov_b32 s0, s13
|
||||
; GFX10-NEXT: s_mov_b32 s1, s11
|
||||
; GFX10-NEXT: s_mov_b32 s13, s10
|
||||
; GFX10-NEXT: s_and_b32 s2, s14, 31
|
||||
; GFX10-NEXT: s_and_b32 s4, s15, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[12:13], s2
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: s_mov_b32 s4, s3
|
||||
; GFX10-NEXT: s_mov_b32 s5, s1
|
||||
; GFX10-NEXT: s_mov_b32 s3, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, s6, 31
|
||||
; GFX10-NEXT: s_and_b32 s6, s7, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: fshr_v2i32:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_clause 0x2
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_mov_b32 s8, s5
|
||||
; GFX11-NEXT: s_mov_b32 s9, s3
|
||||
; GFX11-NEXT: s_mov_b32 s5, s2
|
||||
; GFX11-NEXT: s_and_b32 s2, s6, 31
|
||||
; GFX11-NEXT: s_mov_b32 s8, s3
|
||||
; GFX11-NEXT: s_mov_b32 s9, s1
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
; GFX11-NEXT: s_and_b32 s0, s6, 31
|
||||
; GFX11-NEXT: s_and_b32 s6, s7, 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[8:9], s6
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s6
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: fshr_v2i32:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GFX12-NEXT: s_clause 0x2
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s8, s5
|
||||
; GFX12-NEXT: s_mov_b32 s9, s3
|
||||
; GFX12-NEXT: s_mov_b32 s5, s2
|
||||
; GFX12-NEXT: s_and_b32 s2, s6, 31
|
||||
; GFX12-NEXT: s_mov_b32 s8, s3
|
||||
; GFX12-NEXT: s_mov_b32 s9, s1
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_and_b32 s0, s6, 31
|
||||
; GFX12-NEXT: s_and_b32 s6, s7, 31
|
||||
; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
|
||||
; GFX12-NEXT: s_lshr_b64 s[4:5], s[8:9], s6
|
||||
; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
|
||||
; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s6
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
|
||||
@ -451,54 +464,52 @@ entry:
|
||||
define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
|
||||
; SI-LABEL: fshr_v2i32_imm:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_mov_b32 s0, s9
|
||||
; SI-NEXT: s_mov_b32 s1, s3
|
||||
; SI-NEXT: s_mov_b32 s9, s2
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 9
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], 7
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: s_mov_b32 s8, s3
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_mov_b32 s3, s0
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 9
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fshr_v2i32_imm:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_mov_b32 s0, s5
|
||||
; VI-NEXT: s_mov_b32 s1, s3
|
||||
; VI-NEXT: s_mov_b32 s5, s2
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 9
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 7
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_mov_b32 s6, s3
|
||||
; VI-NEXT: s_mov_b32 s7, s1
|
||||
; VI-NEXT: s_mov_b32 s3, s0
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fshr_v2i32_imm:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s4, s7
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_mov_b32 s7, s2
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 9
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 s4, s3
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_mov_b32 s3, s0
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 9
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; R600-LABEL: fshr_v2i32_imm:
|
||||
@ -518,52 +529,52 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
|
||||
; GFX10-LABEL: fshr_v2i32_imm:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_mov_b32 s4, s7
|
||||
; GFX10-NEXT: s_mov_b32 s7, s2
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[6:7], 7
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 9
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: s_mov_b32 s4, s3
|
||||
; GFX10-NEXT: s_mov_b32 s3, s0
|
||||
; GFX10-NEXT: s_mov_b32 s5, s1
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 9
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: fshr_v2i32_imm:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_mov_b32 s4, s7
|
||||
; GFX11-NEXT: s_mov_b32 s7, s2
|
||||
; GFX11-NEXT: s_mov_b32 s5, s3
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 7
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 9
|
||||
; GFX11-NEXT: s_mov_b32 s6, s3
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
; GFX11-NEXT: s_mov_b32 s7, s1
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 9
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: fshr_v2i32_imm:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s4, s7
|
||||
; GFX12-NEXT: s_mov_b32 s7, s2
|
||||
; GFX12-NEXT: s_mov_b32 s5, s3
|
||||
; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 7
|
||||
; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], 9
|
||||
; GFX12-NEXT: s_mov_b32 s6, s3
|
||||
; GFX12-NEXT: s_mov_b32 s3, s0
|
||||
; GFX12-NEXT: s_mov_b32 s7, s1
|
||||
; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
|
||||
; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 9
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
|
||||
@ -574,63 +585,61 @@ entry:
|
||||
define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
|
||||
; SI-LABEL: fshr_v2i32_imm_src1:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s8, 9
|
||||
; SI-NEXT: s_mov_b32 s10, 7
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_mov_b32 s0, 9
|
||||
; SI-NEXT: s_mov_b32 s1, s3
|
||||
; SI-NEXT: s_and_b32 s3, s9, 31
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
|
||||
; SI-NEXT: s_mov_b32 s11, s2
|
||||
; SI-NEXT: s_and_b32 s1, s8, 31
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[10:11], s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_and_b32 s1, s3, 31
|
||||
; SI-NEXT: s_mov_b32 s11, s0
|
||||
; SI-NEXT: s_and_b32 s0, s2, 31
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fshr_v2i32_imm_src1:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_mov_b32 s6, 9
|
||||
; VI-NEXT: s_mov_b32 s8, 7
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s7, s1
|
||||
; VI-NEXT: s_and_b32 s1, s3, 31
|
||||
; VI-NEXT: s_mov_b32 s9, s0
|
||||
; VI-NEXT: s_and_b32 s0, s2, 31
|
||||
; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_mov_b32 s7, s3
|
||||
; VI-NEXT: s_and_b32 s0, s5, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], s0
|
||||
; VI-NEXT: s_mov_b32 s6, 7
|
||||
; VI-NEXT: s_mov_b32 s7, s2
|
||||
; VI-NEXT: s_and_b32 s1, s4, 31
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[6:7], s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fshr_v2i32_imm_src1:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_mov_b32 s4, 9
|
||||
; GFX9-NEXT: s_mov_b32 s8, 7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_and_b32 s3, s7, 31
|
||||
; GFX9-NEXT: s_mov_b32 s9, s2
|
||||
; GFX9-NEXT: s_and_b32 s2, s6, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s3, 31
|
||||
; GFX9-NEXT: s_mov_b32 s9, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s2, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; R600-LABEL: fshr_v2i32_imm_src1:
|
||||
@ -650,61 +659,61 @@ define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %
|
||||
; GFX10-LABEL: fshr_v2i32_imm_src1:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_mov_b32 s4, 9
|
||||
; GFX10-NEXT: s_mov_b32 s8, 7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_mov_b32 s9, s2
|
||||
; GFX10-NEXT: s_and_b32 s2, s6, 31
|
||||
; GFX10-NEXT: s_and_b32 s6, s7, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: s_mov_b32 s5, s1
|
||||
; GFX10-NEXT: s_mov_b32 s9, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, s2, 31
|
||||
; GFX10-NEXT: s_and_b32 s2, s3, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: fshr_v2i32_imm_src1:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_mov_b32 s6, 9
|
||||
; GFX11-NEXT: s_mov_b32 s8, 7
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_mov_b32 s7, s3
|
||||
; GFX11-NEXT: s_mov_b32 s9, s2
|
||||
; GFX11-NEXT: s_and_b32 s2, s4, 31
|
||||
; GFX11-NEXT: s_and_b32 s4, s5, 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
|
||||
; GFX11-NEXT: s_mov_b32 s7, s1
|
||||
; GFX11-NEXT: s_mov_b32 s9, s0
|
||||
; GFX11-NEXT: s_and_b32 s0, s2, 31
|
||||
; GFX11-NEXT: s_and_b32 s2, s3, 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: fshr_v2i32_imm_src1:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_clause 0x1
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
|
||||
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX12-NEXT: s_mov_b32 s6, 9
|
||||
; GFX12-NEXT: s_mov_b32 s8, 7
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s7, s3
|
||||
; GFX12-NEXT: s_mov_b32 s9, s2
|
||||
; GFX12-NEXT: s_and_b32 s2, s4, 31
|
||||
; GFX12-NEXT: s_and_b32 s4, s5, 31
|
||||
; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
|
||||
; GFX12-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
|
||||
; GFX12-NEXT: s_mov_b32 s7, s1
|
||||
; GFX12-NEXT: s_mov_b32 s9, s0
|
||||
; GFX12-NEXT: s_and_b32 s0, s2, 31
|
||||
; GFX12-NEXT: s_and_b32 s2, s3, 31
|
||||
; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
|
||||
; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> <i32 7, i32 9>, <2 x i32> %y)
|
||||
|
||||
@ -9552,47 +9552,6 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a
|
||||
; GFX9-NEXT: s_cbranch_execnz .LBB136_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s3, s0, 0x4650
|
||||
; GFX11-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX11-NEXT: s_and_b32 s0, s3, -4
|
||||
; GFX11-NEXT: s_and_b32 s3, s3, 3
|
||||
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
|
||||
; GFX11-NEXT: s_lshl_b32 s5, s3, 3
|
||||
; GFX11-NEXT: s_and_b32 s6, s2, 0xffff
|
||||
; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s5
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_not_b32 s3, s2
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX11-NEXT: s_lshl_b32 s4, s6, s5
|
||||
; GFX11-NEXT: s_mov_b32 s5, 0
|
||||
; GFX11-NEXT: .LBB136_1: ; %atomicrmw.start
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, s2, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0
|
||||
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: buffer_gl1_inv
|
||||
; GFX11-NEXT: buffer_gl0_inv
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
|
||||
; GFX11-NEXT: s_cbranch_execnz .LBB136_1
|
||||
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX11-NEXT: s_endpgm
|
||||
%gep = getelementptr i16, ptr addrspace(1) %out, i64 9000
|
||||
%val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
|
||||
ret void
|
||||
@ -9712,47 +9671,6 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad
|
||||
; GFX9-NEXT: s_cbranch_execnz .LBB137_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s3, s0, 0x2328
|
||||
; GFX11-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX11-NEXT: s_and_b32 s0, s3, -4
|
||||
; GFX11-NEXT: s_and_b32 s3, s3, 3
|
||||
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
|
||||
; GFX11-NEXT: s_lshl_b32 s5, s3, 3
|
||||
; GFX11-NEXT: s_and_b32 s6, s2, 0xff
|
||||
; GFX11-NEXT: s_lshl_b32 s2, 0xff, s5
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_not_b32 s3, s2
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX11-NEXT: s_lshl_b32 s4, s6, s5
|
||||
; GFX11-NEXT: s_mov_b32 s5, 0
|
||||
; GFX11-NEXT: .LBB137_1: ; %atomicrmw.start
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, s2, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0
|
||||
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: buffer_gl1_inv
|
||||
; GFX11-NEXT: buffer_gl0_inv
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
|
||||
; GFX11-NEXT: s_cbranch_execnz .LBB137_1
|
||||
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX11-NEXT: s_endpgm
|
||||
%gep = getelementptr i8, ptr addrspace(1) %out, i64 9000
|
||||
%val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
|
||||
ret void
|
||||
|
||||
@ -98,16 +98,16 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
|
||||
; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: s_add_u32 s4, s0, 4
|
||||
; CIVI-NEXT: s_addc_u32 s5, s1, 0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; CIVI-NEXT: v_mov_b32_e32 v4, s3
|
||||
; CIVI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; CIVI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CIVI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; CIVI-NEXT: s_add_u32 s0, s0, 4
|
||||
; CIVI-NEXT: flat_store_dword v[0:1], v2
|
||||
; CIVI-NEXT: s_addc_u32 s1, s1, 0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CIVI-NEXT: v_mov_b32_e32 v2, s3
|
||||
; CIVI-NEXT: flat_store_short v[0:1], v2
|
||||
; CIVI-NEXT: v_mov_b32_e32 v5, s2
|
||||
; CIVI-NEXT: flat_store_short v[2:3], v4
|
||||
; CIVI-NEXT: flat_store_dword v[0:1], v5
|
||||
; CIVI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: load_v3f16_arg:
|
||||
@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
|
||||
; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CIVI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; CIVI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CIVI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; CIVI-NEXT: s_endpgm
|
||||
@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
|
||||
; GFX11-LABEL: load_v4f16_arg:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
store <4 x half> %arg, ptr addrspace(1) %out
|
||||
@ -348,21 +348,37 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
|
||||
; CIVI-LABEL: extload_v3f16_to_v3f32_arg:
|
||||
; CIVI: ; %bb.0:
|
||||
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; CIVI-NEXT: s_add_i32 s12, s12, s17
|
||||
; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; CIVI-NEXT: v_cvt_f32_f16_e32 v1, s4
|
||||
; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; CIVI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v4, s1
|
||||
; CIVI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
|
||||
; CIVI-NEXT: s_endpgm
|
||||
; CI-LABEL: extload_v3f16_to_v3f32_arg:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; CI-NEXT: s_add_i32 s12, s12, s17
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; CI-NEXT: v_mov_b32_e32 v4, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: extload_v3f16_to_v3f32_arg:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; VI-NEXT: s_add_i32 s12, s12, s17
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, s4
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
|
||||
; GFX11: ; %bb.0:
|
||||
@ -370,9 +386,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
%ext = fpext <3 x half> %arg to <3 x float>
|
||||
@ -388,14 +404,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; CI-NEXT: s_lshr_b32 s5, s3, 16
|
||||
; CI-NEXT: s_lshr_b32 s4, s3, 16
|
||||
; CI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v3, s5
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v3, s4
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v1, s5
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; CI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -408,12 +424,12 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s4, s3, 16
|
||||
; VI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v3, s4
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, s5
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -424,10 +440,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_lshr_b32 s4, s3, 16
|
||||
; GFX11-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
%ext = fpext <4 x half> %arg to <4 x float>
|
||||
@ -708,33 +724,61 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
|
||||
; CIVI-LABEL: extload_v4f16_to_v4f64_arg:
|
||||
; CIVI: ; %bb.0:
|
||||
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; CIVI-NEXT: s_add_i32 s12, s12, s17
|
||||
; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CIVI-NEXT: s_lshr_b32 s5, s3, 16
|
||||
; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s3
|
||||
; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s5
|
||||
; CIVI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; CIVI-NEXT: v_cvt_f32_f16_e32 v4, s2
|
||||
; CIVI-NEXT: v_cvt_f32_f16_e32 v6, s4
|
||||
; CIVI-NEXT: s_add_u32 s2, s0, 16
|
||||
; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
|
||||
; CIVI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
|
||||
; CIVI-NEXT: s_addc_u32 s3, s1, 0
|
||||
; CIVI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
|
||||
; CIVI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
|
||||
; CIVI-NEXT: v_mov_b32_e32 v9, s3
|
||||
; CIVI-NEXT: v_mov_b32_e32 v8, s2
|
||||
; CIVI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; CIVI-NEXT: s_nop 0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CIVI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CIVI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
||||
; CIVI-NEXT: s_endpgm
|
||||
; CI-LABEL: extload_v4f16_to_v4f64_arg:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; CI-NEXT: s_add_i32 s12, s12, s17
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_lshr_b32 s4, s3, 16
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
|
||||
; CI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
|
||||
; CI-NEXT: v_cvt_f32_f16_e32 v6, s5
|
||||
; CI-NEXT: s_add_u32 s2, s0, 16
|
||||
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
|
||||
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
|
||||
; CI-NEXT: s_addc_u32 s3, s1, 0
|
||||
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
|
||||
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
|
||||
; CI-NEXT: v_mov_b32_e32 v9, s3
|
||||
; CI-NEXT: v_mov_b32_e32 v8, s2
|
||||
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; CI-NEXT: s_nop 0
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: extload_v4f16_to_v4f64_arg:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; VI-NEXT: s_add_i32 s12, s12, s17
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s5, s3, 16
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, s5
|
||||
; VI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v6, s4
|
||||
; VI-NEXT: s_add_u32 s2, s0, 16
|
||||
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
|
||||
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
|
||||
; VI-NEXT: s_addc_u32 s3, s1, 0
|
||||
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
|
||||
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
|
||||
; VI-NEXT: v_mov_b32_e32 v9, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v8, s2
|
||||
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; VI-NEXT: s_nop 0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
|
||||
; GFX11: ; %bb.0:
|
||||
|
||||
@ -290,19 +290,19 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec,
|
||||
; GCN-LABEL: half4_inselt:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: s_load_dword s5, s[4:5], 0x34
|
||||
; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
|
||||
; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_lshl_b32 s0, s5, 4
|
||||
; GCN-NEXT: s_mov_b32 s5, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
|
||||
; GCN-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
|
||||
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GCN-NEXT: s_lshl_b32 s6, s6, 4
|
||||
; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
|
||||
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GCN-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -418,19 +418,19 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec,
|
||||
; GCN-LABEL: short4_inselt:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: s_load_dword s5, s[4:5], 0x34
|
||||
; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
|
||||
; GCN-NEXT: s_mov_b32 s4, 0x10001
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_lshl_b32 s0, s5, 4
|
||||
; GCN-NEXT: s_mov_b32 s5, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
|
||||
; GCN-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
|
||||
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GCN-NEXT: s_lshl_b32 s6, s6, 4
|
||||
; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
|
||||
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GCN-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -443,18 +443,18 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
|
||||
; GCN-LABEL: byte8_inselt:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: s_load_dword s4, s[4:5], 0x34
|
||||
; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_xor_b32 s5, s3, 0x1010101
|
||||
; GCN-NEXT: s_lshl_b32 s6, s6, 3
|
||||
; GCN-NEXT: s_xor_b32 s4, s2, 0x1010101
|
||||
; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s6
|
||||
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_lshl_b32 s4, s4, 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: s_xor_b32 s1, s3, 0x1010101
|
||||
; GCN-NEXT: s_xor_b32 s0, s2, 0x1010101
|
||||
; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4
|
||||
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
|
||||
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GCN-NEXT: s_endpgm
|
||||
entry:
|
||||
|
||||
@ -1571,13 +1571,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_lshl_b32 s0, s8, 4
|
||||
; VI-NEXT: s_mov_b32 s8, 0x50005
|
||||
; VI-NEXT: s_mov_b32 s9, s8
|
||||
; VI-NEXT: s_mov_b32 s0, 0x50005
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
|
||||
; VI-NEXT: s_xor_b64 s[8:9], s[2:3], s[8:9]
|
||||
; VI-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1]
|
||||
; VI-NEXT: s_mov_b32 s1, s0
|
||||
; VI-NEXT: s_lshl_b32 s8, s8, 4
|
||||
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
|
||||
; VI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8
|
||||
; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
|
||||
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
|
||||
@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -1000,16 +1000,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_add_u32 s4, s0, 4
|
||||
; VI-NEXT: s_addc_u32 s5, s1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: s_add_u32 s0, s0, 4
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_addc_u32 s1, s1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s3
|
||||
; VI-NEXT: flat_store_short v[0:1], v2
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s2
|
||||
; VI-NEXT: flat_store_short v[2:3], v4
|
||||
; VI-NEXT: flat_store_dword v[0:1], v5
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: v3i16_arg:
|
||||
@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
|
||||
@ -139,26 +139,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
|
||||
; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, 0
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s3
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
|
||||
@ -183,51 +177,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
|
||||
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
|
||||
; HEURRC: ; %bb.0:
|
||||
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v12, 0
|
||||
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v4, s12
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v8, s0
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v9, s1
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v10, s2
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v11, s3
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; HEURRC-NEXT: s_endpgm
|
||||
;
|
||||
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
|
||||
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; VGPRRC-NEXT: s_endpgm
|
||||
; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
|
||||
; AGPR: ; %bb.0:
|
||||
@ -276,26 +258,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
|
||||
; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; SDAG-NEXT: v_mov_b32_e32 v12, 0
|
||||
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, 0
|
||||
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SDAG-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SDAG-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s11
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s13
|
||||
; SDAG-NEXT: v_mov_b32_e32 v6, s14
|
||||
; SDAG-NEXT: v_mov_b32_e32 v7, s15
|
||||
; SDAG-NEXT: v_mov_b32_e32 v8, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v9, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v10, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v11, s3
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; SDAG-NEXT: s_nop 1
|
||||
; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
|
||||
; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
|
||||
; SDAG-NEXT: s_nop 7
|
||||
; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
|
||||
@ -320,51 +296,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
|
||||
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
|
||||
; HEURRC: ; %bb.0:
|
||||
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v12, 0
|
||||
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v4, s12
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v8, s0
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v9, s1
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v10, s2
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v11, s3
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; HEURRC-NEXT: s_endpgm
|
||||
;
|
||||
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
|
||||
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; VGPRRC-NEXT: s_endpgm
|
||||
; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
|
||||
; AGPR: ; %bb.0:
|
||||
@ -5455,76 +5419,58 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
|
||||
; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v11, s3
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
|
||||
; HEURRC: ; %bb.0:
|
||||
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v12, 0
|
||||
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v4, s12
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v8, s0
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v9, s1
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v10, s2
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v11, s3
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; HEURRC-NEXT: s_endpgm
|
||||
;
|
||||
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
|
||||
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; VGPRRC-NEXT: s_endpgm
|
||||
; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
|
||||
; AGPR: ; %bb.0:
|
||||
@ -5573,76 +5519,58 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
|
||||
; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v11, s3
|
||||
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; GCN-NEXT: s_nop 1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
|
||||
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
|
||||
; HEURRC: ; %bb.0:
|
||||
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v12, 0
|
||||
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v4, s12
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v8, s0
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v9, s1
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v10, s2
|
||||
; HEURRC-NEXT: v_mov_b32_e32 v11, s3
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; HEURRC-NEXT: s_endpgm
|
||||
;
|
||||
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
|
||||
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
|
||||
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
|
||||
; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
|
||||
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; VGPRRC-NEXT: s_nop 1
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
|
||||
; VGPRRC-NEXT: s_nop 7
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
|
||||
; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; VGPRRC-NEXT: s_endpgm
|
||||
; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
|
||||
; AGPR: ; %bb.0:
|
||||
|
||||
@ -339,53 +339,53 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
|
||||
; VI-SDAG-LABEL: s_exp_v2f32:
|
||||
; VI-SDAG: ; %bb.0:
|
||||
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3fb8a000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
|
||||
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v1
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295
|
||||
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8
|
||||
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
|
||||
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
|
||||
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000
|
||||
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4
|
||||
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4
|
||||
; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000
|
||||
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v6
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
|
||||
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
|
||||
; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
|
||||
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
|
||||
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000
|
||||
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4
|
||||
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; VI-GISEL-LABEL: s_exp_v2f32:
|
||||
@ -520,42 +520,41 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
|
||||
;
|
||||
; SI-SDAG-LABEL: s_exp_v2f32:
|
||||
; SI-SDAG: ; %bb.0:
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
|
||||
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s2, -1
|
||||
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
|
||||
; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0
|
||||
; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
|
||||
; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
|
||||
; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2
|
||||
; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
|
||||
; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
|
||||
; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0
|
||||
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
|
||||
; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
|
||||
; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5
|
||||
; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
|
||||
; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
|
||||
; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
|
||||
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0
|
||||
; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
|
||||
; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x42b17218
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
|
||||
; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
|
||||
; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
|
||||
; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
|
||||
; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
|
||||
; SI-SDAG-NEXT: s_mov_b32 s0, s4
|
||||
; SI-SDAG-NEXT: s_mov_b32 s1, s5
|
||||
; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4
|
||||
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s2, -1
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
|
||||
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-SDAG-NEXT: s_endpgm
|
||||
|
||||
@ -341,53 +341,53 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
|
||||
; VI-SDAG-LABEL: s_exp10_v2f32:
|
||||
; VI-SDAG: ; %bb.0:
|
||||
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x40549000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
|
||||
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v1
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc
|
||||
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x40549000, v8
|
||||
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
|
||||
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
|
||||
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000
|
||||
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4
|
||||
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4
|
||||
; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000
|
||||
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v6
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
|
||||
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
|
||||
; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
|
||||
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc23369f4
|
||||
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000
|
||||
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4
|
||||
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; VI-GISEL-LABEL: s_exp10_v2f32:
|
||||
@ -522,42 +522,41 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
|
||||
;
|
||||
; SI-SDAG-LABEL: s_exp10_v2f32:
|
||||
; SI-SDAG: ; %bb.0:
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
|
||||
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s2, -1
|
||||
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
|
||||
; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0
|
||||
; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
|
||||
; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
|
||||
; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2
|
||||
; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
|
||||
; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
|
||||
; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0
|
||||
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
|
||||
; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
|
||||
; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5
|
||||
; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
|
||||
; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
|
||||
; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
|
||||
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4
|
||||
; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
|
||||
; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x421a209b
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
|
||||
; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
|
||||
; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
|
||||
; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
|
||||
; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
|
||||
; SI-SDAG-NEXT: s_mov_b32 s0, s4
|
||||
; SI-SDAG-NEXT: s_mov_b32 s1, s5
|
||||
; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4
|
||||
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s2, -1
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
|
||||
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-SDAG-NEXT: s_endpgm
|
||||
|
||||
@ -176,26 +176,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s6, -1
|
||||
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
|
||||
; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; SI-SDAG-NEXT: s_mov_b32 s4, s0
|
||||
; SI-SDAG-NEXT: s_mov_b32 s5, s1
|
||||
; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s0
|
||||
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s6, 0xffffffc0, 0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0
|
||||
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s2, -1
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s6
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s4
|
||||
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; SI-GISEL-LABEL: s_exp2_v2f32:
|
||||
@ -225,26 +224,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
|
||||
; VI-SDAG-LABEL: s_exp2_v2f32:
|
||||
; VI-SDAG: ; %bb.0:
|
||||
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
|
||||
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v3, v4, s0
|
||||
; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, s0
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0
|
||||
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s3, 0xffffffc0, 0
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3
|
||||
; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; VI-GISEL-LABEL: s_exp2_v2f32:
|
||||
|
||||
@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
|
||||
define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
|
||||
; SI-SDAG-LABEL: s_log_v2f32:
|
||||
; SI-SDAG: ; %bb.0:
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
|
||||
; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf
|
||||
; SI-SDAG-NEXT: s_mov_b32 s8, 0x3f317217
|
||||
; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
|
||||
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3
|
||||
; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3
|
||||
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
|
||||
; SI-SDAG-NEXT: s_mov_b32 s0, s4
|
||||
; SI-SDAG-NEXT: s_mov_b32 s1, s5
|
||||
; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; SI-SDAG-NEXT: s_mov_b32 s7, 0x3f317217
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
|
||||
; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0
|
||||
; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf
|
||||
; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, -v4
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4
|
||||
; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6
|
||||
; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
|
||||
; SI-SDAG-NEXT: v_log_f32_e32 v5, v1
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5
|
||||
; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2
|
||||
; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3
|
||||
; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2
|
||||
; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
||||
@ -406,51 +405,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
|
||||
; VI-SDAG-LABEL: s_log_v2f32:
|
||||
; VI-SDAG: ; %bb.0:
|
||||
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
|
||||
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
|
||||
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
|
||||
; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v6
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3f317000, v7
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v7, v3
|
||||
; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4
|
||||
; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
|
||||
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v5, v1
|
||||
; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
|
||||
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; VI-GISEL-LABEL: s_log_v2f32:
|
||||
|
||||
@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
|
||||
define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
|
||||
; SI-SDAG-LABEL: s_log10_v2f32:
|
||||
; SI-SDAG: ; %bb.0:
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
|
||||
; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf
|
||||
; SI-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a
|
||||
; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
|
||||
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3
|
||||
; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3
|
||||
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
|
||||
; SI-SDAG-NEXT: s_mov_b32 s0, s4
|
||||
; SI-SDAG-NEXT: s_mov_b32 s1, s5
|
||||
; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; SI-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
|
||||
; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4
|
||||
; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0
|
||||
; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf
|
||||
; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, -v4
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4
|
||||
; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6
|
||||
; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
|
||||
; SI-SDAG-NEXT: v_log_f32_e32 v5, v1
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
|
||||
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5
|
||||
; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2
|
||||
; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3
|
||||
; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2
|
||||
; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3
|
||||
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9
|
||||
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
||||
@ -406,51 +405,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
|
||||
; VI-SDAG-LABEL: s_log10_v2f32:
|
||||
; VI-SDAG: ; %bb.0:
|
||||
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
|
||||
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
|
||||
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
|
||||
; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v6
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3e9a2000, v7
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v7, v3
|
||||
; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4
|
||||
; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
|
||||
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v5, v1
|
||||
; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
|
||||
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
|
||||
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
|
||||
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
|
||||
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; VI-GISEL-LABEL: s_log10_v2f32:
|
||||
|
||||
@ -221,8 +221,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
|
||||
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
|
||||
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s6, -1
|
||||
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
|
||||
; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
@ -238,11 +236,11 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
|
||||
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1
|
||||
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
|
||||
; SI-SDAG-NEXT: v_log_f32_e32 v4, v1
|
||||
; SI-SDAG-NEXT: s_mov_b32 s4, s0
|
||||
; SI-SDAG-NEXT: s_mov_b32 s5, s1
|
||||
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-SDAG-NEXT: s_mov_b32 s2, -1
|
||||
; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2
|
||||
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
|
||||
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; SI-GISEL-LABEL: s_log2_v2f32:
|
||||
@ -285,16 +283,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
|
||||
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
|
||||
; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v0, s2, v0
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
|
||||
; VI-SDAG-NEXT: v_log_f32_e32 v4, v1
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
|
||||
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; VI-GISEL-LABEL: s_log2_v2f32:
|
||||
|
||||
@ -362,16 +362,15 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x
|
||||
; SI-LABEL: s_test_imax_sgt_imm_v2i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_max_i32 s0, s3, 9
|
||||
; SI-NEXT: s_max_i32 s1, s2, 9
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_max_i32 s5, s5, 9
|
||||
; SI-NEXT: s_max_i32 s4, s4, 9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: s_test_imax_sgt_imm_v2i32:
|
||||
@ -869,16 +868,15 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x
|
||||
; SI-LABEL: s_test_umax_ugt_imm_v2i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_max_u32 s0, s3, 23
|
||||
; SI-NEXT: s_max_u32 s1, s2, 15
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_max_u32 s5, s5, 23
|
||||
; SI-NEXT: s_max_u32 s4, s4, 15
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: s_test_umax_ugt_imm_v2i32:
|
||||
|
||||
@ -1074,118 +1074,118 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
|
||||
;
|
||||
; CI-LABEL: s_test_imin_sle_v4i16:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
|
||||
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
|
||||
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; CI-NEXT: s_add_i32 s12, s12, s17
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_ashr_i32 s0, s2, 16
|
||||
; CI-NEXT: s_ashr_i32 s1, s3, 16
|
||||
; CI-NEXT: s_ashr_i32 s6, s0, 16
|
||||
; CI-NEXT: s_ashr_i32 s7, s1, 16
|
||||
; CI-NEXT: s_sext_i32_i16 s0, s0
|
||||
; CI-NEXT: s_sext_i32_i16 s1, s1
|
||||
; CI-NEXT: s_ashr_i32 s8, s2, 16
|
||||
; CI-NEXT: s_ashr_i32 s9, s3, 16
|
||||
; CI-NEXT: s_sext_i32_i16 s2, s2
|
||||
; CI-NEXT: s_sext_i32_i16 s3, s3
|
||||
; CI-NEXT: s_ashr_i32 s6, s4, 16
|
||||
; CI-NEXT: s_ashr_i32 s7, s5, 16
|
||||
; CI-NEXT: s_sext_i32_i16 s4, s4
|
||||
; CI-NEXT: s_sext_i32_i16 s5, s5
|
||||
; CI-NEXT: s_min_i32 s1, s1, s7
|
||||
; CI-NEXT: s_min_i32 s3, s3, s5
|
||||
; CI-NEXT: s_min_i32 s0, s0, s6
|
||||
; CI-NEXT: s_min_i32 s2, s2, s4
|
||||
; CI-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; CI-NEXT: s_and_b32 s3, s3, 0xffff
|
||||
; CI-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; CI-NEXT: s_and_b32 s2, s2, 0xffff
|
||||
; CI-NEXT: s_or_b32 s1, s3, s1
|
||||
; CI-NEXT: s_or_b32 s0, s2, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; CI-NEXT: s_min_i32 s7, s7, s9
|
||||
; CI-NEXT: s_min_i32 s1, s1, s3
|
||||
; CI-NEXT: s_min_i32 s3, s6, s8
|
||||
; CI-NEXT: s_min_i32 s0, s0, s2
|
||||
; CI-NEXT: s_lshl_b32 s7, s7, 16
|
||||
; CI-NEXT: s_and_b32 s1, s1, 0xffff
|
||||
; CI-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; CI-NEXT: s_and_b32 s0, s0, 0xffff
|
||||
; CI-NEXT: s_or_b32 s1, s1, s7
|
||||
; CI-NEXT: s_or_b32 s0, s0, s3
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_test_imin_sle_v4i16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; VI-NEXT: s_add_i32 s12, s12, s17
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_ashr_i32 s6, s3, 16
|
||||
; VI-NEXT: s_ashr_i32 s7, s1, 16
|
||||
; VI-NEXT: s_sext_i32_i16 s3, s3
|
||||
; VI-NEXT: s_sext_i32_i16 s1, s1
|
||||
; VI-NEXT: s_min_i32 s6, s7, s6
|
||||
; VI-NEXT: s_min_i32 s1, s1, s3
|
||||
; VI-NEXT: s_lshl_b32 s6, s6, 16
|
||||
; VI-NEXT: s_and_b32 s1, s1, 0xffff
|
||||
; VI-NEXT: s_or_b32 s1, s1, s6
|
||||
; VI-NEXT: s_ashr_i32 s3, s2, 16
|
||||
; VI-NEXT: s_ashr_i32 s6, s0, 16
|
||||
; VI-NEXT: s_sext_i32_i16 s2, s2
|
||||
; VI-NEXT: s_sext_i32_i16 s0, s0
|
||||
; VI-NEXT: s_min_i32 s3, s6, s3
|
||||
; VI-NEXT: s_min_i32 s0, s0, s2
|
||||
; VI-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; VI-NEXT: s_and_b32 s0, s0, 0xffff
|
||||
; VI-NEXT: s_or_b32 s0, s0, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_ashr_i32 s0, s5, 16
|
||||
; VI-NEXT: s_ashr_i32 s1, s3, 16
|
||||
; VI-NEXT: s_min_i32 s0, s1, s0
|
||||
; VI-NEXT: s_sext_i32_i16 s1, s5
|
||||
; VI-NEXT: s_sext_i32_i16 s3, s3
|
||||
; VI-NEXT: s_min_i32 s1, s3, s1
|
||||
; VI-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; VI-NEXT: s_and_b32 s1, s1, 0xffff
|
||||
; VI-NEXT: s_or_b32 s0, s1, s0
|
||||
; VI-NEXT: s_ashr_i32 s1, s4, 16
|
||||
; VI-NEXT: s_ashr_i32 s3, s2, 16
|
||||
; VI-NEXT: s_min_i32 s1, s3, s1
|
||||
; VI-NEXT: s_sext_i32_i16 s3, s4
|
||||
; VI-NEXT: s_sext_i32_i16 s2, s2
|
||||
; VI-NEXT: s_min_i32 s2, s2, s3
|
||||
; VI-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; VI-NEXT: s_and_b32 s2, s2, 0xffff
|
||||
; VI-NEXT: s_or_b32 s1, s2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: s_test_imin_sle_v4i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX9-NEXT: v_pk_min_i16 v1, s3, v0
|
||||
; GFX9-NEXT: v_pk_min_i16 v0, s2, v3
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: v_pk_min_i16 v1, s1, v0
|
||||
; GFX9-NEXT: v_pk_min_i16 v0, s0, v3
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: s_test_imin_sle_v4i16:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
|
||||
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_pk_min_i16 v1, s3, s5
|
||||
; GFX10-NEXT: v_pk_min_i16 v0, s2, s4
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: v_pk_min_i16 v1, s1, s3
|
||||
; GFX10-NEXT: v_pk_min_i16 v0, s0, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: s_test_imin_sle_v4i16:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_pk_min_i16 v1, s3, s5
|
||||
; GFX11-NEXT: v_pk_min_i16 v0, s2, s4
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_pk_min_i16 v1, s1, s3
|
||||
; GFX11-NEXT: v_pk_min_i16 v0, s0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: s_test_imin_sle_v4i16:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
|
||||
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
|
||||
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_pk_min_i16 v1, s3, s7
|
||||
; GFX1250-NEXT: v_pk_min_i16 v0, s2, s6
|
||||
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX1250-NEXT: v_pk_min_i16 v1, s1, s3
|
||||
; GFX1250-NEXT: v_pk_min_i16 v0, s0, s2
|
||||
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
%cmp = icmp sle <4 x i16> %a, %b
|
||||
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
|
||||
@ -1636,92 +1636,92 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
|
||||
;
|
||||
; CI-LABEL: s_test_imin_slt_v2i32:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
|
||||
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
|
||||
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; CI-NEXT: s_add_i32 s12, s12, s17
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_min_i32 s1, s1, s3
|
||||
; CI-NEXT: s_min_i32 s0, s0, s2
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_min_i32 s0, s3, s5
|
||||
; CI-NEXT: s_min_i32 s1, s2, s4
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_test_imin_slt_v2i32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; VI-NEXT: s_add_i32 s12, s12, s17
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_min_i32 s1, s1, s3
|
||||
; VI-NEXT: s_min_i32 s0, s0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_min_i32 s0, s3, s5
|
||||
; VI-NEXT: s_min_i32 s1, s2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: s_test_imin_slt_v2i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_min_i32 s3, s3, s5
|
||||
; GFX9-NEXT: s_min_i32 s2, s2, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX9-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX9-NEXT: s_min_i32 s0, s0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: s_test_imin_slt_v2i32:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
|
||||
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_min_i32 s2, s2, s4
|
||||
; GFX10-NEXT: s_min_i32 s3, s3, s5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: s_min_i32 s0, s0, s2
|
||||
; GFX10-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: s_test_imin_slt_v2i32:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_min_i32 s2, s2, s4
|
||||
; GFX11-NEXT: s_min_i32 s3, s3, s5
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: s_min_i32 s0, s0, s2
|
||||
; GFX11-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: s_test_imin_slt_v2i32:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GFX1250-NEXT: s_clause 0x1
|
||||
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
|
||||
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
|
||||
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_min_i32 s2, s2, s6
|
||||
; GFX1250-NEXT: s_min_i32 s3, s3, s7
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX1250-NEXT: s_min_i32 s0, s0, s2
|
||||
; GFX1250-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
%cmp = icmp slt <2 x i32> %a, %b
|
||||
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
|
||||
|
||||
@ -76,33 +76,19 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
|
||||
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; PACKED-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-SDAG-LABEL: fadd_v2_vs:
|
||||
; GFX1250-SDAG: ; %bb.0:
|
||||
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
||||
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
||||
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
|
||||
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
||||
; GFX1250-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-GISEL-LABEL: fadd_v2_vs:
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
|
||||
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
||||
; GFX1250-GISEL-NEXT: s_endpgm
|
||||
; GFX1250-LABEL: fadd_v2_vs:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
||||
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
|
||||
; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
||||
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
||||
@ -1377,33 +1363,19 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
|
||||
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; PACKED-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-SDAG-LABEL: fmul_v2_vs:
|
||||
; GFX1250-SDAG: ; %bb.0:
|
||||
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
||||
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
||||
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
||||
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
|
||||
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
||||
; GFX1250-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-GISEL-LABEL: fmul_v2_vs:
|
||||
; GFX1250-GISEL: ; %bb.0:
|
||||
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
||||
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
|
||||
; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
||||
; GFX1250-GISEL-NEXT: s_endpgm
|
||||
; GFX1250-LABEL: fmul_v2_vs:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
|
||||
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
|
||||
; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
|
||||
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
|
||||
@ -3568,8 +3540,8 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
|
||||
; GFX900-LABEL: fadd_fadd_fsub:
|
||||
; GFX900: ; %bb.0: ; %bb
|
||||
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, s1, v0
|
||||
@ -3577,14 +3549,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
|
||||
; GFX900-NEXT: v_add_f32_e32 v3, s2, v0
|
||||
; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1
|
||||
; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3
|
||||
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
||||
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; PACKED-SDAG-LABEL: fadd_fadd_fsub:
|
||||
; PACKED-SDAG: ; %bb.0: ; %bb
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0
|
||||
; PACKED-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3
|
||||
; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0
|
||||
@ -3592,7 +3564,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
|
||||
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0
|
||||
; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0
|
||||
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
|
||||
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
|
||||
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
|
||||
; PACKED-SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
|
||||
|
||||
@ -94,64 +94,62 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
;
|
||||
; SI-LABEL: rotl_v2i32:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s6, s2
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_sub_i32 s5, 32, s5
|
||||
; SI-NEXT: s_sub_i32 s4, 32, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: v_alignbit_b32 v1, s7, s7, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_alignbit_b32 v0, s6, s6, v0
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s3
|
||||
; SI-NEXT: s_sub_i32 s2, 32, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: rotl_v2i32:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s1, 32, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_sub_i32 s0, 32, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v2
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NEXT: s_sub_i32 s2, 32, s2
|
||||
; GFX8-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: rotl_v2i32:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_sub_i32 s4, 32, s7
|
||||
; GFX10-NEXT: s_sub_i32 s5, 32, s6
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s4
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s5
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX10-NEXT: s_sub_i32 s2, 32, s2
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: rotl_v2i32:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_sub_i32 s4, 32, s7
|
||||
; GFX11-NEXT: s_sub_i32 s5, 32, s6
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s4
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s5
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX11-NEXT: s_sub_i32 s2, 32, s2
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = shl <2 x i32> %x, %y
|
||||
|
||||
@ -83,56 +83,54 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
;
|
||||
; SI-LABEL: rotr_v2i32:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s6, s2
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: v_alignbit_b32 v1, s7, s7, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_alignbit_b32 v0, s6, s6, v0
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: rotr_v2i32:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2
|
||||
; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v4
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: rotr_v2i32:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s7
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s6
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: rotr_v2i32:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s5
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s4
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
|
||||
|
||||
@ -56,8 +56,8 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 {
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}s_addk_v2i32_k0:
|
||||
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
|
||||
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
|
||||
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
|
||||
; SI: s_endpgm
|
||||
; Note: dummy argument here to prevent combining of descriptor loads for %out and %b
|
||||
define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 {
|
||||
|
||||
@ -331,80 +331,79 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_sub_i32 s1, 0, s2
|
||||
; VI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_sub_i32 s0, 0, s3
|
||||
; VI-NEXT: s_lshr_b32 s4, s3, 16
|
||||
; VI-NEXT: s_sub_i32 s5, 0, s5
|
||||
; VI-NEXT: s_ashr_i32 s6, s2, 16
|
||||
; VI-NEXT: s_sext_i32_i16 s1, s1
|
||||
; VI-NEXT: s_sext_i32_i16 s2, s2
|
||||
; VI-NEXT: s_sub_i32 s4, 0, s4
|
||||
; VI-NEXT: s_sext_i32_i16 s5, s5
|
||||
; VI-NEXT: s_max_i32 s1, s2, s1
|
||||
; VI-NEXT: s_sext_i32_i16 s0, s0
|
||||
; VI-NEXT: s_sext_i32_i16 s2, s3
|
||||
; VI-NEXT: s_max_i32 s5, s6, s5
|
||||
; VI-NEXT: s_ashr_i32 s6, s3, 16
|
||||
; VI-NEXT: s_lshr_b32 s7, s2, 16
|
||||
; VI-NEXT: s_sub_i32 s7, 0, s7
|
||||
; VI-NEXT: s_sub_i32 s4, 0, s3
|
||||
; VI-NEXT: s_lshr_b32 s6, s3, 16
|
||||
; VI-NEXT: s_ashr_i32 s8, s2, 16
|
||||
; VI-NEXT: s_sext_i32_i16 s7, s7
|
||||
; VI-NEXT: s_sub_i32 s5, 0, s2
|
||||
; VI-NEXT: s_sub_i32 s6, 0, s6
|
||||
; VI-NEXT: s_max_i32 s7, s8, s7
|
||||
; VI-NEXT: s_ashr_i32 s8, s3, 16
|
||||
; VI-NEXT: s_sext_i32_i16 s4, s4
|
||||
; VI-NEXT: s_max_i32 s0, s2, s0
|
||||
; VI-NEXT: s_max_i32 s4, s6, s4
|
||||
; VI-NEXT: s_add_i32 s0, s0, 2
|
||||
; VI-NEXT: s_lshl_b32 s2, s4, 16
|
||||
; VI-NEXT: s_and_b32 s0, s0, 0xffff
|
||||
; VI-NEXT: s_add_i32 s1, s1, 2
|
||||
; VI-NEXT: s_or_b32 s0, s2, s0
|
||||
; VI-NEXT: s_lshl_b32 s2, s5, 16
|
||||
; VI-NEXT: s_and_b32 s1, s1, 0xffff
|
||||
; VI-NEXT: s_or_b32 s1, s2, s1
|
||||
; VI-NEXT: s_add_i32 s0, s0, 0x20000
|
||||
; VI-NEXT: s_add_i32 s1, s1, 0x20000
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_sext_i32_i16 s3, s3
|
||||
; VI-NEXT: s_sext_i32_i16 s6, s6
|
||||
; VI-NEXT: s_sext_i32_i16 s5, s5
|
||||
; VI-NEXT: s_sext_i32_i16 s2, s2
|
||||
; VI-NEXT: s_max_i32 s3, s3, s4
|
||||
; VI-NEXT: s_max_i32 s6, s8, s6
|
||||
; VI-NEXT: s_max_i32 s2, s2, s5
|
||||
; VI-NEXT: s_add_i32 s3, s3, 2
|
||||
; VI-NEXT: s_lshl_b32 s4, s6, 16
|
||||
; VI-NEXT: s_and_b32 s3, s3, 0xffff
|
||||
; VI-NEXT: s_add_i32 s2, s2, 2
|
||||
; VI-NEXT: s_or_b32 s3, s4, s3
|
||||
; VI-NEXT: s_lshl_b32 s4, s7, 16
|
||||
; VI-NEXT: s_and_b32 s2, s2, 0xffff
|
||||
; VI-NEXT: s_or_b32 s2, s4, s2
|
||||
; VI-NEXT: s_add_i32 s3, s3, 0x20000
|
||||
; VI-NEXT: s_add_i32 s2, s2, 0x20000
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; CI-LABEL: s_abs_v4i16:
|
||||
; CI: ; %bb.0:
|
||||
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; CI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; CI-NEXT: s_mov_b32 s6, -1
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_mov_b32 s4, s0
|
||||
; CI-NEXT: s_mov_b32 s5, s1
|
||||
; CI-NEXT: s_ashr_i32 s0, s3, 16
|
||||
; CI-NEXT: s_ashr_i32 s1, s2, 16
|
||||
; CI-NEXT: s_lshr_b32 s8, s2, 16
|
||||
; CI-NEXT: s_lshr_b32 s9, s3, 16
|
||||
; CI-NEXT: s_sext_i32_i16 s10, s3
|
||||
; CI-NEXT: s_sext_i32_i16 s11, s2
|
||||
; CI-NEXT: s_sub_i32 s3, 0, s3
|
||||
; CI-NEXT: s_sub_i32 s2, 0, s2
|
||||
; CI-NEXT: s_sext_i32_i16 s3, s3
|
||||
; CI-NEXT: s_sext_i32_i16 s2, s2
|
||||
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; CI-NEXT: s_ashr_i32 s6, s5, 16
|
||||
; CI-NEXT: s_lshr_b32 s9, s5, 16
|
||||
; CI-NEXT: s_sext_i32_i16 s10, s5
|
||||
; CI-NEXT: s_sub_i32 s5, 0, s5
|
||||
; CI-NEXT: s_ashr_i32 s7, s4, 16
|
||||
; CI-NEXT: s_lshr_b32 s8, s4, 16
|
||||
; CI-NEXT: s_sext_i32_i16 s11, s4
|
||||
; CI-NEXT: s_sext_i32_i16 s5, s5
|
||||
; CI-NEXT: s_sub_i32 s4, 0, s4
|
||||
; CI-NEXT: s_sub_i32 s9, 0, s9
|
||||
; CI-NEXT: s_sub_i32 s8, 0, s8
|
||||
; CI-NEXT: s_sext_i32_i16 s4, s4
|
||||
; CI-NEXT: s_sext_i32_i16 s9, s9
|
||||
; CI-NEXT: s_sub_i32 s8, 0, s8
|
||||
; CI-NEXT: s_max_i32 s5, s10, s5
|
||||
; CI-NEXT: s_sext_i32_i16 s8, s8
|
||||
; CI-NEXT: s_max_i32 s2, s11, s2
|
||||
; CI-NEXT: s_max_i32 s3, s10, s3
|
||||
; CI-NEXT: s_max_i32 s1, s1, s8
|
||||
; CI-NEXT: s_max_i32 s0, s0, s9
|
||||
; CI-NEXT: s_add_i32 s3, s3, 2
|
||||
; CI-NEXT: s_add_i32 s2, s2, 2
|
||||
; CI-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; CI-NEXT: s_and_b32 s3, s3, 0xffff
|
||||
; CI-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; CI-NEXT: s_and_b32 s2, s2, 0xffff
|
||||
; CI-NEXT: s_or_b32 s0, s0, s3
|
||||
; CI-NEXT: s_or_b32 s1, s1, s2
|
||||
; CI-NEXT: s_add_i32 s0, s0, 0x20000
|
||||
; CI-NEXT: s_add_i32 s1, s1, 0x20000
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; CI-NEXT: s_max_i32 s6, s6, s9
|
||||
; CI-NEXT: s_max_i32 s4, s11, s4
|
||||
; CI-NEXT: s_add_i32 s5, s5, 2
|
||||
; CI-NEXT: s_max_i32 s7, s7, s8
|
||||
; CI-NEXT: s_lshl_b32 s6, s6, 16
|
||||
; CI-NEXT: s_and_b32 s5, s5, 0xffff
|
||||
; CI-NEXT: s_add_i32 s4, s4, 2
|
||||
; CI-NEXT: s_or_b32 s5, s6, s5
|
||||
; CI-NEXT: s_lshl_b32 s6, s7, 16
|
||||
; CI-NEXT: s_and_b32 s4, s4, 0xffff
|
||||
; CI-NEXT: s_or_b32 s4, s6, s4
|
||||
; CI-NEXT: s_add_i32 s5, s5, 0x20000
|
||||
; CI-NEXT: s_add_i32 s4, s4, 0x20000
|
||||
; CI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; CI-NEXT: s_mov_b32 s2, -1
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; CI-NEXT: s_endpgm
|
||||
%z0 = insertelement <4 x i16> poison, i16 0, i16 0
|
||||
%z1 = insertelement <4 x i16> %z0, i16 0, i16 1
|
||||
|
||||
@ -136,8 +136,7 @@ define amdgpu_kernel void @store_as4_2xi32(ptr addrspace(4) %p, <2 x i32> %v) {
|
||||
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s3
|
||||
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
|
||||
; CHECK-NEXT: s_endpgm
|
||||
store <2 x i32> %v, ptr addrspace(4) %p
|
||||
@ -164,8 +163,7 @@ define amdgpu_kernel void @store_as4_2xfloat(ptr addrspace(4) %p, <2 x float> %v
|
||||
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s3
|
||||
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
|
||||
; CHECK-NEXT: s_endpgm
|
||||
store <2 x float> %v, ptr addrspace(4) %p
|
||||
|
||||
@ -164,102 +164,98 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
|
||||
;
|
||||
; GFX6-LABEL: test_udivrem_v2:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
|
||||
; GFX6-NEXT: s_sub_i32 s0, 0, s8
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
|
||||
; GFX6-NEXT: s_sub_i32 s6, 0, s2
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3
|
||||
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
|
||||
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0
|
||||
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s5, s1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
|
||||
; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
|
||||
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
|
||||
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX6-NEXT: s_mul_i32 s4, s4, s8
|
||||
; GFX6-NEXT: s_sub_i32 s2, s2, s4
|
||||
; GFX6-NEXT: s_sub_i32 s4, s2, s8
|
||||
; GFX6-NEXT: s_cmp_ge_u32 s2, s8
|
||||
; GFX6-NEXT: s_cselect_b32 s2, s4, s2
|
||||
; GFX6-NEXT: s_sub_i32 s4, s2, s8
|
||||
; GFX6-NEXT: s_cmp_ge_u32 s2, s8
|
||||
; GFX6-NEXT: s_cselect_b32 s2, s4, s2
|
||||
; GFX6-NEXT: s_sub_i32 s4, 0, s9
|
||||
; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1
|
||||
; GFX6-NEXT: s_mov_b32 s4, s0
|
||||
; GFX6-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GFX6-NEXT: s_mul_i32 s6, s6, s2
|
||||
; GFX6-NEXT: s_sub_i32 s0, s0, s6
|
||||
; GFX6-NEXT: s_sub_i32 s6, s0, s2
|
||||
; GFX6-NEXT: s_cmp_ge_u32 s0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s0, s6, s0
|
||||
; GFX6-NEXT: s_sub_i32 s6, s0, s2
|
||||
; GFX6-NEXT: s_cmp_ge_u32 s0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s0, s6, s0
|
||||
; GFX6-NEXT: s_sub_i32 s2, 0, s3
|
||||
; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
|
||||
; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
|
||||
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX6-NEXT: s_mul_i32 s0, s0, s9
|
||||
; GFX6-NEXT: s_sub_i32 s0, s3, s0
|
||||
; GFX6-NEXT: s_sub_i32 s1, s0, s9
|
||||
; GFX6-NEXT: s_cmp_ge_u32 s0, s9
|
||||
; GFX6-NEXT: s_cselect_b32 s0, s1, s0
|
||||
; GFX6-NEXT: s_sub_i32 s1, s0, s9
|
||||
; GFX6-NEXT: s_cmp_ge_u32 s0, s9
|
||||
; GFX6-NEXT: s_cselect_b32 s0, s1, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0
|
||||
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX6-NEXT: s_mul_i32 s2, s2, s3
|
||||
; GFX6-NEXT: s_sub_i32 s1, s1, s2
|
||||
; GFX6-NEXT: s_sub_i32 s2, s1, s3
|
||||
; GFX6-NEXT: s_cmp_ge_u32 s1, s3
|
||||
; GFX6-NEXT: s_cselect_b32 s1, s2, s1
|
||||
; GFX6-NEXT: s_sub_i32 s2, s1, s3
|
||||
; GFX6-NEXT: s_cmp_ge_u32 s1, s3
|
||||
; GFX6-NEXT: s_cselect_b32 s1, s2, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: test_udivrem_v2:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GFX8-NEXT: s_sub_i32 s0, 0, s6
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
|
||||
; GFX8-NEXT: s_sub_i32 s6, 0, s2
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3
|
||||
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0
|
||||
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
|
||||
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
|
||||
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
|
||||
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX8-NEXT: s_mul_i32 s4, s4, s6
|
||||
; GFX8-NEXT: s_sub_i32 s2, s2, s4
|
||||
; GFX8-NEXT: s_sub_i32 s4, s2, s6
|
||||
; GFX8-NEXT: s_cmp_ge_u32 s2, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s2, s4, s2
|
||||
; GFX8-NEXT: s_sub_i32 s4, s2, s6
|
||||
; GFX8-NEXT: s_cmp_ge_u32 s2, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s2, s4, s2
|
||||
; GFX8-NEXT: s_sub_i32 s4, 0, s7
|
||||
; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GFX8-NEXT: s_mul_i32 s6, s6, s2
|
||||
; GFX8-NEXT: s_sub_i32 s0, s0, s6
|
||||
; GFX8-NEXT: s_sub_i32 s6, s0, s2
|
||||
; GFX8-NEXT: s_cmp_ge_u32 s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s6, s0
|
||||
; GFX8-NEXT: s_sub_i32 s6, s0, s2
|
||||
; GFX8-NEXT: s_cmp_ge_u32 s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s6, s0
|
||||
; GFX8-NEXT: s_sub_i32 s2, 0, s3
|
||||
; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
|
||||
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
|
||||
; GFX8-NEXT: v_mul_hi_u32 v2, s3, v0
|
||||
; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v2
|
||||
; GFX8-NEXT: s_mul_i32 s0, s0, s7
|
||||
; GFX8-NEXT: s_sub_i32 s0, s3, s0
|
||||
; GFX8-NEXT: s_sub_i32 s1, s0, s7
|
||||
; GFX8-NEXT: s_cmp_ge_u32 s0, s7
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
|
||||
; GFX8-NEXT: s_mul_i32 s0, s0, s3
|
||||
; GFX8-NEXT: s_sub_i32 s0, s1, s0
|
||||
; GFX8-NEXT: s_sub_i32 s1, s0, s3
|
||||
; GFX8-NEXT: s_cmp_ge_u32 s0, s3
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
|
||||
; GFX8-NEXT: s_sub_i32 s1, s0, s7
|
||||
; GFX8-NEXT: s_cmp_ge_u32 s0, s7
|
||||
; GFX8-NEXT: s_sub_i32 s1, s0, s3
|
||||
; GFX8-NEXT: s_cmp_ge_u32 s0, s3
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
%result0 = udiv <2 x i32> %x, %y
|
||||
store <2 x i32> %result0, ptr addrspace(1) %out
|
||||
|
||||
@ -340,8 +340,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
|
||||
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -1,215 +0,0 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: opt -S -passes=instcombine %s | FileCheck %s
|
||||
|
||||
@test.data = private unnamed_addr constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 4
|
||||
@test.ptrdata = private unnamed_addr constant [8 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null], align 8
|
||||
|
||||
; Verify that InstCombine copies range metadata when cloning a load as part of
|
||||
; replacing an alloca initialized via memcpy from a constant. OK
|
||||
define i32 @copy_range_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define i32 @copy_range_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
||||
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !range [[RNG0:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[L]]
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x i32], align 4
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
||||
%l = load i32, ptr %arrayidx, align 4, !range !0
|
||||
ret i32 %l
|
||||
}
|
||||
|
||||
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
|
||||
|
||||
!0 = !{i32 0, i32 100}
|
||||
|
||||
; Verify TBAA metadata on a cloned load is preserved. OK
|
||||
define i32 @copy_tbaa_metadata_after_memcpy(i64 %x, ptr %sink) {
|
||||
; CHECK-LABEL: define i32 @copy_tbaa_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]], ptr [[SINK:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
||||
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[SCALAR_TYPE_TBAA1:![0-9]+]]
|
||||
; CHECK-NEXT: store i32 [[L]], ptr [[SINK]], align 4
|
||||
; CHECK-NEXT: ret i32 [[L]]
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x i32], align 4
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
||||
%l = load i32, ptr %arrayidx, align 4, !tbaa !1
|
||||
store i32 %l, ptr %sink, align 4
|
||||
ret i32 %l
|
||||
}
|
||||
|
||||
!1 = !{!2, !2, i64 0}
|
||||
!2 = !{!"scalar type", !3}
|
||||
!3 = !{!"root"}
|
||||
|
||||
; Verify dereferenceable_or_null metadata on a cloned load is preserved
|
||||
; when the loaded value type is a pointer. OK
|
||||
define ptr @copy_deref_or_null_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define ptr @copy_deref_or_null_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: ret ptr null
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x ptr], align 8
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 @test.ptrdata, i64 64, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x ptr], ptr %data, i64 0, i64 %x
|
||||
%l = load ptr, ptr %arrayidx, align 8, !dereferenceable_or_null !4
|
||||
ret ptr %l
|
||||
}
|
||||
|
||||
!4 = !{i64 8}
|
||||
|
||||
; Verify nonnull metadata on a cloned load is preserved
|
||||
; when the loaded value type is a pointer. OK
|
||||
define ptr @copy_nonnull_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define ptr @copy_nonnull_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: ret ptr null
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x ptr], align 8
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 @test.ptrdata, i64 64, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x ptr], ptr %data, i64 0, i64 %x
|
||||
%l = load ptr, ptr %arrayidx, align 8, !nonnull !5
|
||||
ret ptr %l
|
||||
}
|
||||
|
||||
!5 = !{}
|
||||
|
||||
; Verify invariant.load metadata on a cloned load is preserved. OK
|
||||
define i32 @copy_invariant_load_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define i32 @copy_invariant_load_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
||||
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !invariant.load [[META4:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[L]]
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x i32], align 4
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
||||
%l = load i32, ptr %arrayidx, align 4, !invariant.load !5
|
||||
ret i32 %l
|
||||
}
|
||||
|
||||
; Verify alias.scope and noalias metadata on a cloned load are preserved. OK
|
||||
define i32 @copy_aliasscope_noalias_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define i32 @copy_aliasscope_noalias_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
||||
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META5]]
|
||||
; CHECK-NEXT: ret i32 [[L]]
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x i32], align 4
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
||||
%l = load i32, ptr %arrayidx, align 4, !alias.scope !6, !noalias !6
|
||||
ret i32 %l
|
||||
}
|
||||
|
||||
; Verify nontemporal metadata on a cloned load is preserved.OK
|
||||
define i32 @copy_nontemporal_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define i32 @copy_nontemporal_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
||||
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !nontemporal [[META8:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[L]]
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x i32], align 4
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
||||
%l = load i32, ptr %arrayidx, align 4, !nontemporal !9
|
||||
ret i32 %l
|
||||
}
|
||||
|
||||
; Verify access group metadata on a cloned load is preserved. OK
|
||||
define i32 @copy_access_group_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define i32 @copy_access_group_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
||||
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[L]]
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x i32], align 4
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
||||
%l = load i32, ptr %arrayidx, align 4, !llvm.access.group !10
|
||||
ret i32 %l
|
||||
}
|
||||
|
||||
; Verify noalias.addrspace metadata on a cloned load is preserved.
|
||||
define i32 @copy_noalias_addrspace_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define i32 @copy_noalias_addrspace_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
||||
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !noalias.addrspace [[META10:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[L]]
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x i32], align 4
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
||||
%l = load i32, ptr %arrayidx, align 4, !noalias.addrspace !12
|
||||
ret i32 %l
|
||||
}
|
||||
|
||||
; Verify llvm.mem.parallel_loop_access metadata on a cloned load is preserved. OK
|
||||
define i32 @copy_mem_parallel_loop_access_metadata_after_memcpy(i64 %x) {
|
||||
; CHECK-LABEL: define i32 @copy_mem_parallel_loop_access_metadata_after_memcpy(
|
||||
; CHECK-SAME: i64 [[X:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
||||
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access [[META11:![0-9]+]]
|
||||
; CHECK-NEXT: ret i32 [[L]]
|
||||
;
|
||||
entry:
|
||||
%data = alloca [8 x i32], align 4
|
||||
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
||||
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
||||
%l = load i32, ptr %arrayidx, align 4, !llvm.mem.parallel_loop_access !13
|
||||
ret i32 %l
|
||||
}
|
||||
|
||||
!6 = !{!7}
|
||||
!7 = distinct !{!7, !8}
|
||||
!8 = distinct !{!8}
|
||||
!9 = !{i32 1}
|
||||
!10 = distinct !{}
|
||||
!12 = !{i32 5, i32 6}
|
||||
!13 = !{!14}
|
||||
!14 = distinct !{}
|
||||
|
||||
|
||||
|
||||
;.
|
||||
; CHECK: [[RNG0]] = !{i32 0, i32 100}
|
||||
; CHECK: [[SCALAR_TYPE_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
|
||||
; CHECK: [[META2]] = !{!"scalar type", [[META3:![0-9]+]]}
|
||||
; CHECK: [[META3]] = !{!"root"}
|
||||
; CHECK: [[META4]] = !{}
|
||||
; CHECK: [[META5]] = !{[[META6:![0-9]+]]}
|
||||
; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
|
||||
; CHECK: [[META7]] = distinct !{[[META7]]}
|
||||
; CHECK: [[META8]] = !{i32 1}
|
||||
; CHECK: [[ACC_GRP9]] = distinct !{}
|
||||
; CHECK: [[META10]] = !{i32 5, i32 6}
|
||||
; CHECK: [[META11]] = !{[[META12:![0-9]+]]}
|
||||
; CHECK: [[META12]] = distinct !{}
|
||||
;.
|
||||
@ -1,159 +0,0 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
|
||||
|
||||
; We expect the merged vector load to retain nontemporal and tbaa, and normalization to handle
|
||||
; other load-only metadata.
|
||||
define void @lsv_copy_load_metadata(ptr %p) {
|
||||
; CHECK-LABEL: define void @lsv_copy_load_metadata(
|
||||
; CHECK-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[CHAR_TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
|
||||
; CHECK-NEXT: [[LD01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
|
||||
; CHECK-NEXT: [[LD1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
|
||||
; CHECK-NEXT: [[LD1_MUT_BC:%.*]] = bitcast i32 [[LD1_MUT2]] to <2 x i16>
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
%ld0 = load i32, ptr %p, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
|
||||
%ld1 = load <2 x i16>, ptr %p1, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
|
||||
ret void
|
||||
}
|
||||
|
||||
; Check that metadata on stores is preserved when LSV normalizes mixed-typed
|
||||
; chains (exercises copyMetadataForAccess on stores).
|
||||
define void @lsv_copy_store_metadata(ptr %p) {
|
||||
; CHECK-LABEL: define void @lsv_copy_store_metadata(
|
||||
; CHECK-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: store <2 x i32> <i32 7, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
store i32 7, ptr %p, align 4, !nontemporal !5
|
||||
store <2 x i16> <i16 4, i16 5>, ptr %p1, align 4, !nontemporal !5
|
||||
ret void
|
||||
}
|
||||
|
||||
; Copy alias.scope and noalias metadata on vectorized stores.
|
||||
define void @lsv_copy_store_alias_metadata(ptr %p) {
|
||||
; CHECK-LABEL: define void @lsv_copy_store_alias_metadata(
|
||||
; CHECK-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: store <2 x i32> <i32 1, i32 bitcast (<2 x i16> <i16 2, i16 3> to i32)>, ptr [[P]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META5]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
store i32 1, ptr %p, align 4, !alias.scope !11, !noalias !11
|
||||
store <2 x i16> <i16 2, i16 3>, ptr %p1, align 4, !alias.scope !11, !noalias !11
|
||||
ret void
|
||||
}
|
||||
|
||||
; Copy access group metadata on vectorized stores.
|
||||
define void @lsv_copy_store_access_group(ptr %p) {
|
||||
; CHECK-LABEL: define void @lsv_copy_store_access_group(
|
||||
; CHECK-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: store <2 x i32> <i32 9, i32 bitcast (<2 x i16> <i16 8, i16 7> to i32)>, ptr [[P]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
store i32 9, ptr %p, align 4, !llvm.access.group !14
|
||||
store <2 x i16> <i16 8, i16 7>, ptr %p1, align 4, !llvm.access.group !14
|
||||
ret void
|
||||
}
|
||||
|
||||
; Copy noundef metadata on vectorized stores.
|
||||
define void @lsv_copy_store_noundef(ptr %p) {
|
||||
; CHECK-LABEL: define void @lsv_copy_store_noundef(
|
||||
; CHECK-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: store <2 x i32> <i32 42, i32 bitcast (<2 x i16> <i16 6, i16 5> to i32)>, ptr [[P]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
store i32 42, ptr %p, align 4, !noundef !15
|
||||
store <2 x i16> <i16 6, i16 5>, ptr %p1, align 4, !noundef !15
|
||||
ret void
|
||||
}
|
||||
|
||||
; Copy noalias.addrspace metadata on vectorized stores.
|
||||
define void @lsv_copy_store_noalias_addrspace(ptr %p) {
|
||||
; CHECK-LABEL: define void @lsv_copy_store_noalias_addrspace(
|
||||
; CHECK-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: store <2 x i32> <i32 11, i32 bitcast (<2 x i16> <i16 10, i16 9> to i32)>, ptr [[P]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
store i32 11, ptr %p, align 4, !noalias.addrspace !16
|
||||
store <2 x i16> <i16 10, i16 9>, ptr %p1, align 4, !noalias.addrspace !16
|
||||
ret void
|
||||
}
|
||||
|
||||
; Copy llvm.mem.parallel_loop_access metadata on vectorized stores.
|
||||
define void @lsv_copy_store_mem_parallel_loop_access(ptr %p) {
|
||||
; CHECK-LABEL: define void @lsv_copy_store_mem_parallel_loop_access(
|
||||
; CHECK-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: store <2 x i32> <i32 13, i32 bitcast (<2 x i16> <i16 12, i16 11> to i32)>, ptr [[P]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
store i32 13, ptr %p, align 4, !llvm.mem.parallel_loop_access !17
|
||||
store <2 x i16> <i16 12, i16 11>, ptr %p1, align 4, !llvm.mem.parallel_loop_access !17
|
||||
ret void
|
||||
}
|
||||
|
||||
; Normalized type is not a pointer in the following test, avoid copying
|
||||
; dereferenceable_or_null metadata.
|
||||
define void @lsv_no_copy_deref_or_null(ptr %p) {
|
||||
; CHECK-LABEL: define void @lsv_no_copy_deref_or_null(
|
||||
; CHECK-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[P]], align 8
|
||||
; CHECK-NEXT: [[LD0_MUT1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
|
||||
; CHECK-NEXT: [[LD12:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
|
||||
; CHECK-NEXT: [[LD0_MUT_BC:%.*]] = inttoptr i64 [[LD0_MUT1]] to ptr
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
%ld0 = load ptr, ptr %p, align 4, !dereferenceable_or_null !7
|
||||
%ld1 = load i64, ptr %p1, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
!0 = !{!3, !3, i64 0}
|
||||
!3 = !{!"omnipotent char", !4, i64 0}
|
||||
!4 = !{!"Simple C/C++ TBAA"}
|
||||
!5 = !{i32 1}
|
||||
!6 = !{}
|
||||
!7 = !{i64 8}
|
||||
!8 = !{i64 1, i64 256}
|
||||
!11 = !{!12}
|
||||
!12 = distinct !{!12, !13}
|
||||
!13 = distinct !{!13}
|
||||
!14 = distinct !{}
|
||||
!15 = !{}
|
||||
!16 = !{i32 5, i32 6}
|
||||
!17 = !{!18}
|
||||
!18 = distinct !{}
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
;.
|
||||
; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
|
||||
; CHECK: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0}
|
||||
; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"}
|
||||
; CHECK: [[META3]] = !{}
|
||||
; CHECK: [[META4]] = !{i32 1}
|
||||
; CHECK: [[META5]] = !{[[META6:![0-9]+]]}
|
||||
; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
|
||||
; CHECK: [[META7]] = distinct !{[[META7]]}
|
||||
;.
|
||||
@ -1,273 +1,57 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
|
||||
|
||||
define void @no_merge_i16_half(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @no_merge_i16_half(
|
||||
define void @merge_i32_v2i16_f32_v4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @merge_i32_v2i16_f32_v4i8(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD_0:%.*]] = load i16, ptr addrspace(1) [[PTR1]], align 2
|
||||
; CHECK-NEXT: [[LOAD_1:%.*]] = load half, ptr addrspace(1) [[GEP_1]], align 2
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i16 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 2
|
||||
; CHECK-NEXT: store half [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 2
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0
|
||||
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
|
||||
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
|
||||
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 2
|
||||
; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
|
||||
; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR1]], i64 3
|
||||
; CHECK-NEXT: [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
|
||||
; CHECK-NEXT: store i32 [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store <2 x i16> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2
|
||||
; CHECK-NEXT: store float [[LOAD3]], ptr addrspace(2) [[STORE_GEP3]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(2) [[PTR2]], i64 3
|
||||
; CHECK-NEXT: store <4 x i8> [[LOAD4]], ptr addrspace(2) [[STORE_GEP4]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i16, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i16, ptr addrspace(1) %ptr1
|
||||
%load.1 = load half, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i16, ptr addrspace(2) %ptr2, i64 1
|
||||
store i16 %load.0, ptr addrspace(2) %ptr2
|
||||
store half %load.1, ptr addrspace(2) %store.gep.1
|
||||
%gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
|
||||
%load1 = load i32, ptr addrspace(1) %gep1, align 4
|
||||
%gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr1, i64 1
|
||||
%load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
|
||||
%gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 2
|
||||
%load3 = load float, ptr addrspace(1) %gep3, align 4
|
||||
%gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr1, i64 3
|
||||
%load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
|
||||
%store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
|
||||
store i32 %load1, ptr addrspace(2) %store.gep1, align 4
|
||||
%store.gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(2) %ptr2, i64 1
|
||||
store <2 x i16> %load2, ptr addrspace(2) %store.gep2, align 4
|
||||
%store.gep3 = getelementptr inbounds float, ptr addrspace(2) %ptr2, i64 2
|
||||
store float %load3, ptr addrspace(2) %store.gep3, align 4
|
||||
%store.gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(2) %ptr2, i64 3
|
||||
store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @no_merge_i16_float(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @no_merge_i16_float(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD_0:%.*]] = load i16, ptr addrspace(1) [[PTR1]], align 2
|
||||
; CHECK-NEXT: [[LOAD_1:%.*]] = load float, ptr addrspace(1) [[GEP_1]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i16 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 2
|
||||
; CHECK-NEXT: store float [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i16, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i16, ptr addrspace(1) %ptr1
|
||||
%load.1 = load float, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i16, ptr addrspace(2) %ptr2, i64 1
|
||||
store i16 %load.0, ptr addrspace(2) %ptr2
|
||||
store float %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @merge_i32_v2i16(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @merge_i32_v2i16(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: [[LOAD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[LOAD_1_MUT_BC:%.*]] = bitcast i32 [[LOAD_1_MUT2]] to <2 x i16>
|
||||
; CHECK-NEXT: [[LOAD_1_BC:%.*]] = bitcast <2 x i16> [[LOAD_1_MUT_BC]] to i32
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_01]], i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD_1_BC]], i32 1
|
||||
; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(2) [[PTR2]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i32, ptr addrspace(1) %ptr1
|
||||
%load.1 = load <2 x i16>, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
|
||||
store i32 %load.0, ptr addrspace(2) %ptr2
|
||||
store <2 x i16> %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @no_merge_i32_ptr(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @no_merge_i32_ptr(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-NEXT: [[LOAD_1:%.*]] = load ptr, ptr addrspace(1) [[GEP_1]], align 8
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4
|
||||
; CHECK-NEXT: store ptr [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i32, ptr addrspace(1) %ptr1
|
||||
%load.1 = load ptr, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
|
||||
store i32 %load.0, ptr addrspace(2) %ptr2
|
||||
store ptr %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @no_merge_i32_half(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @no_merge_i32_half(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-NEXT: [[LOAD_1:%.*]] = load half, ptr addrspace(1) [[GEP_1]], align 2
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4
|
||||
; CHECK-NEXT: store half [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 2
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i32, ptr addrspace(1) %ptr1
|
||||
%load.1 = load half, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
|
||||
store i32 %load.0, ptr addrspace(2) %ptr2
|
||||
store half %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @merge_i32_float(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @merge_i32_float(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[LOAD_12]] to float
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_01]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1
|
||||
; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(2) [[PTR2]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i32, ptr addrspace(1) %ptr1
|
||||
%load.1 = load float, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
|
||||
store i32 %load.0, ptr addrspace(2) %ptr2
|
||||
store float %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @no_merge_i32_double(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @no_merge_i32_double(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-NEXT: [[LOAD_1:%.*]] = load double, ptr addrspace(1) [[GEP_1]], align 8
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4
|
||||
; CHECK-NEXT: store double [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i32, ptr addrspace(1) %ptr1
|
||||
%load.1 = load double, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
|
||||
store i32 %load.0, ptr addrspace(2) %ptr2
|
||||
store double %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @merge_i64_ptr(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @merge_i64_ptr(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8
|
||||
; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[LOAD_12]] to ptr
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8
|
||||
; CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(2) [[STORE_GEP_1]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i64, ptr addrspace(1) %ptr1
|
||||
%load.1 = load ptr, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1
|
||||
store i64 %load.0, ptr addrspace(2) %ptr2
|
||||
store ptr %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @no_merge_i64_float(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @no_merge_i64_float(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD_0:%.*]] = load i64, ptr addrspace(1) [[PTR1]], align 8
|
||||
; CHECK-NEXT: [[LOAD_1:%.*]] = load float, ptr addrspace(1) [[GEP_1]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i64 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 8
|
||||
; CHECK-NEXT: store float [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i64, ptr addrspace(1) %ptr1
|
||||
%load.1 = load float, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1
|
||||
store i64 %load.0, ptr addrspace(2) %ptr2
|
||||
store float %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @merge_i64_double(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @merge_i64_double(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8
|
||||
; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[LOAD_12]] to double
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8
|
||||
; CHECK-NEXT: store double [[TMP2]], ptr addrspace(2) [[STORE_GEP_1]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i64, ptr addrspace(1) %ptr1
|
||||
%load.1 = load double, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1
|
||||
store i64 %load.0, ptr addrspace(2) %ptr2
|
||||
store double %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @merge_i64_v2i32(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @merge_i64_v2i32(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8
|
||||
; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: [[LOAD_1_MUT2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[LOAD_1_MUT_BC:%.*]] = bitcast i64 [[LOAD_1_MUT2]] to <2 x i32>
|
||||
; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8
|
||||
; CHECK-NEXT: [[LOAD_1_BC:%.*]] = bitcast <2 x i32> [[LOAD_1_MUT_BC]] to i64
|
||||
; CHECK-NEXT: store i64 [[LOAD_1_BC]], ptr addrspace(2) [[STORE_GEP_1]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.0 = load i64, ptr addrspace(1) %ptr1
|
||||
%load.1 = load <2 x i32>, ptr addrspace(1) %gep.1
|
||||
%store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1
|
||||
store i64 %load.0, ptr addrspace(2) %ptr2
|
||||
store <2 x i32> %load.1, ptr addrspace(2) %store.gep.1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @merge_i32_v2i16_v4i8(ptr addrspace(1) %ptr1) {
|
||||
; CHECK-LABEL: define void @merge_i32_v2i16_v4i8(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]]) {
|
||||
; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP2]], align 4
|
||||
; CHECK-NEXT: [[LOAD2_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: [[LOAD4_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[LOAD2_MUT_BC:%.*]] = bitcast i32 [[LOAD2_MUT1]] to <2 x i16>
|
||||
; CHECK-NEXT: [[LOAD4_MUT_BC:%.*]] = bitcast i32 [[LOAD4_MUT2]] to <4 x i8>
|
||||
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 3
|
||||
; CHECK-NEXT: [[LOAD_3:%.*]] = load float, ptr addrspace(1) [[GEP_3]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%load.0 = load i32, ptr addrspace(1) %ptr1, align 4
|
||||
%gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
|
||||
%load.1 = load <2 x i16>, ptr addrspace(1) %gep.1, align 4
|
||||
%gep.2 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 2
|
||||
%load.2 = load <4 x i8>, ptr addrspace(1) %gep.2, align 4
|
||||
%gep.3 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 3
|
||||
%load.3 = load float, ptr addrspace(1) %gep.3, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @merge_float_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @merge_float_v2f16_type(
|
||||
define void @merge_f32_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @merge_f32_v2f16_type(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP1]], align 4
|
||||
; CHECK-NEXT: [[LOAD1_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: [[LOAD2_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[LOAD1_TOORIG:%.*]] = bitcast i32 [[LOAD1_MUT1]] to float
|
||||
; CHECK-NEXT: [[LOAD2_TOORIG:%.*]] = bitcast i32 [[LOAD2_MUT2]] to <2 x half>
|
||||
; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr addrspace(1) [[GEP1]], align 4
|
||||
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
|
||||
; CHECK-NEXT: [[LOAD1_BC:%.*]] = bitcast float [[LOAD1_TOORIG]] to i32
|
||||
; CHECK-NEXT: [[LOAD2_BC:%.*]] = bitcast <2 x half> [[LOAD2_TOORIG]] to i32
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD1_BC]], i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD2_BC]], i32 1
|
||||
; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(2) [[STORE_GEP1]], align 4
|
||||
; CHECK-NEXT: store float [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0
|
||||
@ -304,3 +88,27 @@ define void @merge_v2f16_bfloat_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %p
|
||||
store <2 x half> %load2, ptr addrspace(2) %store.gep2, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @no_merge_mixed_ptr_addrspaces(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-LABEL: define void @no_merge_mixed_ptr_addrspaces(
|
||||
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[PTR1]], i64 0
|
||||
; CHECK-NEXT: [[LOAD1:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[GEP1]], align 4
|
||||
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) [[PTR1]], i64 1
|
||||
; CHECK-NEXT: [[LOAD2:%.*]] = load ptr addrspace(2), ptr addrspace(1) [[GEP2]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
|
||||
; CHECK-NEXT: store ptr addrspace(1) [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
|
||||
; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) [[PTR2]], i64 1
|
||||
; CHECK-NEXT: store ptr addrspace(2) [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%gep1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %ptr1, i64 0
|
||||
%load1 = load ptr addrspace(1), ptr addrspace(1) %gep1, align 4
|
||||
%gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) %ptr1, i64 1
|
||||
%load2 = load ptr addrspace(2), ptr addrspace(1) %gep2, align 4
|
||||
%store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
|
||||
store ptr addrspace(1) %load1, ptr addrspace(2) %store.gep1, align 4
|
||||
%store.gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) %ptr2, i64 1
|
||||
store ptr addrspace(2) %load2, ptr addrspace(2) %store.gep2, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -130,14 +130,14 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Ideally this would be merged
|
||||
define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
|
||||
; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[A]], align 4
|
||||
; CHECK-NEXT: [[LD_01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
|
||||
; CHECK-NEXT: [[LD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
|
||||
; CHECK-NEXT: [[LD_1_TOORIG:%.*]] = bitcast i32 [[LD_1_MUT2]] to <2 x i16>
|
||||
; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
|
||||
; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
|
||||
; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
@ -149,283 +149,5 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @no_merge_load_i32_v2i8(ptr addrspace(1) nocapture %a) #0 {
|
||||
; CHECK-LABEL: define amdgpu_kernel void @no_merge_load_i32_v2i8(
|
||||
; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
|
||||
; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
|
||||
; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i8>, ptr addrspace(1) [[A_1]], align 2
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
|
||||
|
||||
%ld.0 = load i32, ptr addrspace(1) %a
|
||||
%ld.1 = load <2 x i8>, ptr addrspace(1) %a.1
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_normalize_loads(ptr %p) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @test_normalize_loads(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR1:[0-9]+]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L0_EXT:%.*]] = zext i32 [[L01]] to i64
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L1_CAST:%.*]] = bitcast <2 x i16> [[L1_MUT_BC]] to i32
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L1_EXT:%.*]] = zext i32 [[L1_CAST]] to i64
|
||||
; CHECK-OOB-RELAXED-NEXT: [[ADD:%.*]] = add i64 [[L0_EXT]], [[L1_EXT]]
|
||||
; CHECK-OOB-RELAXED-NEXT: store i64 [[ADD]], ptr null, align 8
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @test_normalize_loads(
|
||||
; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-STRICT-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
|
||||
; CHECK-OOB-STRICT-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
|
||||
; CHECK-OOB-STRICT-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
|
||||
; CHECK-OOB-STRICT-NEXT: [[L0_EXT:%.*]] = zext i32 [[L01]] to i64
|
||||
; CHECK-OOB-STRICT-NEXT: [[L1_CAST:%.*]] = bitcast <2 x i16> [[L1_MUT_BC]] to i32
|
||||
; CHECK-OOB-STRICT-NEXT: [[L1_EXT:%.*]] = zext i32 [[L1_CAST]] to i64
|
||||
; CHECK-OOB-STRICT-NEXT: [[ADD:%.*]] = add i64 [[L0_EXT]], [[L1_EXT]]
|
||||
; CHECK-OOB-STRICT-NEXT: store i64 [[ADD]], ptr null, align 8
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
%l0 = load i32, ptr %p
|
||||
%l1 = load <2 x i16>, ptr %p1
|
||||
%l0_ext = zext i32 %l0 to i64
|
||||
%l1_cast = bitcast <2 x i16> %l1 to i32
|
||||
%l1_ext = zext i32 %l1_cast to i64
|
||||
%add = add i64 %l0_ext, %l1_ext
|
||||
store i64 %add, ptr null
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @test_normalize_stores(ptr %p) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @test_normalize_stores(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR1]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-RELAXED-NEXT: store <2 x i32> <i32 123, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @test_normalize_stores(
|
||||
; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-STRICT-NEXT: store <2 x i32> <i32 123, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
store i32 123, ptr %p
|
||||
store <2 x i16> <i16 4, i16 5>, ptr %p1
|
||||
ret void
|
||||
}
|
||||
|
||||
; TODO: Fix the below test
|
||||
; Check that metadata on loads is preserved when LSV normalizes mixed-typed
|
||||
; chains (exercises copyMetadataForAccess on loads).
|
||||
define void @lsv_copy_load_metadata(ptr %p) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @lsv_copy_load_metadata(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @lsv_copy_load_metadata(
|
||||
; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-STRICT-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
|
||||
; CHECK-OOB-STRICT-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
|
||||
; CHECK-OOB-STRICT-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
|
||||
; CHECK-OOB-STRICT-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
%ld0 = load i32, ptr %p, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
|
||||
%ld1 = load <2 x i16>, ptr %p1, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
|
||||
ret void
|
||||
}
|
||||
|
||||
; Check that metadata on stores is preserved when LSV normalizes mixed-typed
|
||||
; chains (exercises copyMetadataForAccess on stores).
|
||||
define void @lsv_copy_store_metadata(ptr %p) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @lsv_copy_store_metadata(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-RELAXED-NEXT: store <2 x i32> <i32 7, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]]
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @lsv_copy_store_metadata(
|
||||
; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-STRICT-NEXT: store <2 x i32> <i32 7, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]]
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p1 = getelementptr i32, ptr %p, i64 1
|
||||
store i32 7, ptr %p, align 4, !nontemporal !5
|
||||
store <2 x i16> <i16 4, i16 5>, ptr %p1, align 4, !nontemporal !5
|
||||
ret void
|
||||
}
|
||||
|
||||
!0 = !{!3, !3, i64 0}
|
||||
!3 = !{!"omnipotent char", !4, i64 0}
|
||||
!4 = !{!"Simple C/C++ TBAA"}
|
||||
!5 = !{i32 1}
|
||||
!6 = !{}
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
|
||||
|
||||
; Non power-of-two combined span (12 bytes) must not merge chains.
|
||||
define void @no_merge_non_pot_span(ptr addrspace(1) %p) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @no_merge_non_pot_span(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[P:%.*]]) #[[ATTR1]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: [[P8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 8
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L1:%.*]] = load float, ptr addrspace(1) [[P8]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @no_merge_non_pot_span(
|
||||
; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[P:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-STRICT-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: [[P8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 8
|
||||
; CHECK-OOB-STRICT-NEXT: [[L1:%.*]] = load float, ptr addrspace(1) [[P8]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%l0 = load i32, ptr addrspace(1) %p, align 4
|
||||
%p8 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
|
||||
%l1 = load float, ptr addrspace(1) %p8, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @no_merge_diff_ptrop(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @no_merge_diff_ptrop(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: [[LOAD_1:%.*]] = load i32, ptr addrspace(2) [[PTR2]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @no_merge_diff_ptrop(
|
||||
; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: [[LOAD_1:%.*]] = load i32, ptr addrspace(2) [[PTR2]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
%load.0 = load i32, ptr addrspace(1) %ptr1
|
||||
%load.1 = load i32, ptr addrspace(2) %ptr2
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @no_merge_load_store(ptr addrspace(1) %ptr1) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @no_merge_load_store(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]]) #[[ATTR1]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: store i32 111, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @no_merge_load_store(
|
||||
; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: store i32 111, ptr addrspace(1) [[PTR1]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
%load.0 = load i32, ptr addrspace(1) %ptr1
|
||||
store i32 111, ptr addrspace(1) %ptr1
|
||||
ret void
|
||||
}
|
||||
|
||||
; Stores in this test should not be vectorized as the total byte span
|
||||
; from the end of %gep.a to the end of %gep.b is not a power of 2. This
|
||||
; is a necessary condition for splitChainByAlignment.
|
||||
define void @check_contiguity_of_base_ptrs(ptr addrspace(1) %ptr) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @check_contiguity_of_base_ptrs(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: store i32 274, ptr addrspace(1) [[PTR]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 4
|
||||
; CHECK-OOB-RELAXED-NEXT: store i64 3610770474484254748, ptr addrspace(1) [[GEP_A]], align 8
|
||||
; CHECK-OOB-RELAXED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 12
|
||||
; CHECK-OOB-RELAXED-NEXT: store <2 x i32> <i32 1819043144, i32 1867980911>, ptr addrspace(1) [[GEP_B]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @check_contiguity_of_base_ptrs(
|
||||
; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: store i32 274, ptr addrspace(1) [[PTR]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 4
|
||||
; CHECK-OOB-STRICT-NEXT: store i64 3610770474484254748, ptr addrspace(1) [[GEP_A]], align 8
|
||||
; CHECK-OOB-STRICT-NEXT: [[GEP_B:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 12
|
||||
; CHECK-OOB-STRICT-NEXT: store <2 x i32> <i32 1819043144, i32 1867980911>, ptr addrspace(1) [[GEP_B]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
store i32 274, ptr addrspace(1) %ptr, align 4
|
||||
%gep.a = getelementptr inbounds nuw i8, ptr addrspace(1) %ptr, i64 4
|
||||
store i64 3610770474484254748, ptr addrspace(1) %gep.a, align 8
|
||||
%gep.b = getelementptr inbounds nuw i8, ptr addrspace(1) %ptr, i64 12
|
||||
store <2 x i32> <i32 1819043144, i32 1867980911>, ptr addrspace(1) %gep.b, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Offset is unknown in the following test, LSV should fail to vectorize.
|
||||
define amdgpu_kernel void @assert_computeLeaderDelta(ptr addrspace(1) %a, i64 %idx) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @assert_computeLeaderDelta(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[A:%.*]], i64 [[IDX:%.*]]) #[[ATTR1]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-RELAXED-NEXT: [[LD0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[A]], i64 [[IDX]]
|
||||
; CHECK-OOB-RELAXED-NEXT: [[LD1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @assert_computeLeaderDelta(
|
||||
; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[A:%.*]], i64 [[IDX:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-STRICT-NEXT: [[LD0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[A]], i64 [[IDX]]
|
||||
; CHECK-OOB-STRICT-NEXT: [[LD1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%ld0 = load i32, ptr addrspace(1) %a, align 4
|
||||
%p1 = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %idx
|
||||
%ld1 = load <2 x i16>, ptr addrspace(1) %p1, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Overlapping ranges after rebasing should prevent merging across chains.
|
||||
define void @no_merge_overlap_after_rebase(ptr addrspace(1) %p) {
|
||||
; CHECK-OOB-RELAXED-LABEL: define void @no_merge_overlap_after_rebase(
|
||||
; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[P:%.*]]) #[[ATTR1]] {
|
||||
; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
|
||||
; CHECK-OOB-RELAXED-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 2
|
||||
; CHECK-OOB-RELAXED-NEXT: [[L1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
|
||||
; CHECK-OOB-RELAXED-NEXT: ret void
|
||||
;
|
||||
; CHECK-OOB-STRICT-LABEL: define void @no_merge_overlap_after_rebase(
|
||||
; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[P:%.*]]) {
|
||||
; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-OOB-STRICT-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
|
||||
; CHECK-OOB-STRICT-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 2
|
||||
; CHECK-OOB-STRICT-NEXT: [[L1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
|
||||
; CHECK-OOB-STRICT-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%l0 = load i32, ptr addrspace(1) %p, align 4
|
||||
%p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 2
|
||||
%l1 = load <2 x i16>, ptr addrspace(1) %p1, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user