llvm-project/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
Nikita Popov 573ca36753
[IR] Replace alignment argument with attribute on masked intrinsics (#163802)
The `masked.load`, `masked.store`, `masked.gather` and `masked.scatter`
intrinsics currently accept a separate alignment immarg. Replace this
with an `align` attribute on the pointer / vector of pointers argument.

This is the standard representation for alignment information on
intrinsics, and is already used by all other memory intrinsics. This
means the signatures now match llvm.expandload, llvm.vp.load, etc.
(Things like llvm.memcpy used to have a separate alignment argument as
well, but were already migrated a long time ago.)

It's worth noting that the masked.gather and masked.scatter intrinsics
previously accepted a zero alignment to indicate the ABI type alignment
of the element type. This special case is gone now: If the align
attribute is omitted, the implied alignment is 1, as usual. If ABI
alignment is desired, it needs to be explicitly emitted (which the
IRBuilder API already requires anyway).
2025-10-20 08:50:09 +00:00

466 lines
17 KiB
C++

//===-- RISCVInterleavedAccess.cpp - RISC-V Interleaved Access Transform --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Functions and callbacks related to the InterleavedAccessPass.
//
//===----------------------------------------------------------------------===//
#include "RISCV.h"
#include "RISCVISelLowering.h"
#include "RISCVSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
bool RISCVTargetLowering::isLegalInterleavedAccessType(
VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace,
const DataLayout &DL) const {
EVT VT = getValueType(DL, VTy);
// Don't lower vlseg/vsseg for vector types that can't be split.
if (!isTypeLegal(VT))
return false;
if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
!allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
Alignment))
return false;
MVT ContainerVT = VT.getSimpleVT();
if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
if (!Subtarget.useRVVForFixedLengthVectors())
return false;
// Sometimes the interleaved access pass picks up splats as interleaves of
// one element. Don't lower these.
if (FVTy->getNumElements() < 2)
return false;
ContainerVT = getContainerForFixedLengthVector(VT.getSimpleVT());
}
// Need to make sure that EMUL * NFIELDS ≤ 8
auto [LMUL, Fractional] = RISCVVType::decodeVLMUL(getLMUL(ContainerVT));
if (Fractional)
return true;
return Factor * LMUL <= 8;
}
static const Intrinsic::ID FixedVlsegIntrIds[] = {
Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask,
Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask,
Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask,
Intrinsic::riscv_seg8_load_mask};
static const Intrinsic::ID FixedVlssegIntrIds[] = {
Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask,
Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask,
Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask,
Intrinsic::riscv_sseg8_load_mask};
static const Intrinsic::ID ScalableVlsegIntrIds[] = {
Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
Intrinsic::riscv_vlseg8_mask};
static const Intrinsic::ID FixedVssegIntrIds[] = {
Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
Intrinsic::riscv_seg8_store_mask};
static const Intrinsic::ID FixedVsssegIntrIds[] = {
Intrinsic::riscv_sseg2_store_mask, Intrinsic::riscv_sseg3_store_mask,
Intrinsic::riscv_sseg4_store_mask, Intrinsic::riscv_sseg5_store_mask,
Intrinsic::riscv_sseg6_store_mask, Intrinsic::riscv_sseg7_store_mask,
Intrinsic::riscv_sseg8_store_mask};
static const Intrinsic::ID ScalableVssegIntrIds[] = {
Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
Intrinsic::riscv_vsseg8_mask};
static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
assert(N);
if (N == 1)
return true;
using namespace PatternMatch;
// Right now we're only recognizing the simplest pattern.
uint64_t C;
if (match(V, m_CombineOr(m_ConstantInt(C),
m_NUWMul(m_Value(), m_ConstantInt(C)))) &&
C && C % N == 0)
return true;
if (isPowerOf2_32(N)) {
KnownBits KB = llvm::computeKnownBits(V, DL);
return KB.countMinTrailingZeros() >= Log2_32(N);
}
return false;
}
/// Do the common operand retrieval and validition required by the
/// routines below.
static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
Instruction *I, Value *&Ptr, Value *&Mask,
Value *&VL, Align &Alignment) {
IRBuilder<> Builder(I);
const DataLayout &DL = I->getDataLayout();
ElementCount EC = VTy->getElementCount();
if (auto *LI = dyn_cast<LoadInst>(I)) {
assert(LI->isSimple());
Ptr = LI->getPointerOperand();
Alignment = LI->getAlign();
assert(!Mask && "Unexpected mask on a load");
Mask = Builder.getAllOnesMask(EC);
VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC)
: Constant::getAllOnesValue(XLenTy);
return true;
}
if (auto *SI = dyn_cast<StoreInst>(I)) {
assert(SI->isSimple());
Ptr = SI->getPointerOperand();
Alignment = SI->getAlign();
assert(!Mask && "Unexpected mask on a store");
Mask = Builder.getAllOnesMask(EC);
VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC)
: Constant::getAllOnesValue(XLenTy);
return true;
}
auto *II = cast<IntrinsicInst>(I);
switch (II->getIntrinsicID()) {
default:
llvm_unreachable("Unsupported intrinsic type");
case Intrinsic::vp_load:
case Intrinsic::vp_store: {
auto *VPLdSt = cast<VPIntrinsic>(I);
Ptr = VPLdSt->getMemoryPointerParam();
Alignment = VPLdSt->getPointerAlignment().value_or(
DL.getABITypeAlign(VTy->getElementType()));
assert(Mask && "vp.load and vp.store needs a mask!");
Value *WideEVL = VPLdSt->getVectorLengthParam();
// Conservatively check if EVL is a multiple of factor, otherwise some
// (trailing) elements might be lost after the transformation.
if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
return false;
auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
return true;
}
case Intrinsic::masked_load: {
Ptr = II->getOperand(0);
Alignment = II->getParamAlign(0).valueOrOne();
if (!isa<UndefValue>(II->getOperand(2)))
return false;
assert(Mask && "masked.load needs a mask!");
VL = isa<FixedVectorType>(VTy)
? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
: Constant::getAllOnesValue(XLenTy);
return true;
}
case Intrinsic::masked_store: {
Ptr = II->getOperand(1);
Alignment = II->getParamAlign(1).valueOrOne();
assert(Mask && "masked.store needs a mask!");
VL = isa<FixedVectorType>(VTy)
? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
: Constant::getAllOnesValue(XLenTy);
return true;
}
}
}
/// Lower an interleaved load into a vlsegN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
///
/// Into:
/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.riscv.seg2.load.v4i32.p0.i64(
/// %ptr, i64 4)
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Indices.size() == Shuffles.size());
assert(GapMask.getBitWidth() == Factor);
// We only support cases where the skipped fields are the trailing ones.
// TODO: Lower to strided load if there is only a single active field.
unsigned MaskFactor = GapMask.popcount();
if (MaskFactor < 2 || !GapMask.isMask())
return false;
IRBuilder<> Builder(Load);
const DataLayout &DL = Load->getDataLayout();
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
return false;
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL))
return false;
CallInst *SegLoad = nullptr;
if (MaskFactor < Factor) {
// Lower to strided segmented load.
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2],
{VTy, PtrTy, XLenTy, XLenTy},
{Ptr, Stride, Mask, VL});
} else {
// Lower to normal segmented load.
SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
{VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
}
for (unsigned i = 0; i < Shuffles.size(); i++) {
unsigned FactorIdx = Indices[i];
if (FactorIdx >= MaskFactor) {
// Replace masked-off factors (that are still extracted) with poison.
Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy));
} else {
Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx);
Shuffles[i]->replaceAllUsesWith(SubVec);
}
}
return true;
}
/// Lower an interleaved store into a vssegN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
/// call void llvm.riscv.seg3.store.v4i32.p0.i64(%sub.v0, %sub.v1, %sub.v2,
/// %ptr, i32 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vsseg3 instruction in CodeGen.
bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor,
const APInt &GapMask) const {
assert(GapMask.getBitWidth() == Factor);
// We only support cases where the skipped fields are the trailing ones.
// TODO: Lower to strided store if there is only a single active field.
unsigned MaskFactor = GapMask.popcount();
if (MaskFactor < 2 || !GapMask.isMask())
return false;
IRBuilder<> Builder(Store);
const DataLayout &DL = Store->getDataLayout();
auto Mask = SVI->getShuffleMask();
auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
// Given SVI : <n*factor x ty>, then VTy : <n x ty>
auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
ShuffleVTy->getNumElements() / Factor);
auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
if (!getMemOperands(MaskFactor, VTy, XLenTy, Store, Ptr, LaneMask, VL,
Alignment))
return false;
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL))
return false;
Function *SegStoreFunc;
if (MaskFactor < Factor)
// Strided segmented store.
SegStoreFunc = Intrinsic::getOrInsertDeclaration(
Store->getModule(), FixedVsssegIntrIds[MaskFactor - 2],
{VTy, PtrTy, XLenTy, XLenTy});
else
// Normal segmented store.
SegStoreFunc = Intrinsic::getOrInsertDeclaration(
Store->getModule(), FixedVssegIntrIds[Factor - 2],
{VTy, PtrTy, XLenTy});
SmallVector<Value *, 10> Ops;
SmallVector<int, 16> NewShuffleMask;
for (unsigned i = 0; i < MaskFactor; i++) {
// Collect shuffle mask for this lane.
for (unsigned j = 0; j < VTy->getNumElements(); j++)
NewShuffleMask.push_back(Mask[i + Factor * j]);
Value *Shuffle = Builder.CreateShuffleVector(
SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask);
Ops.push_back(Shuffle);
NewShuffleMask.clear();
}
Ops.push_back(Ptr);
if (MaskFactor < Factor) {
// Insert the stride argument.
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
Ops.push_back(ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes));
}
Ops.append({LaneMask, VL});
Builder.CreateCall(SegStoreFunc, Ops);
return true;
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
if (Factor > 8)
return false;
IRBuilder<> Builder(Load);
VectorType *ResVTy = getDeinterleavedVectorType(DI);
const DataLayout &DL = Load->getDataLayout();
auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
if (!getMemOperands(Factor, ResVTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
return false;
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
if (!isLegalInterleavedAccessType(ResVTy, Factor, Alignment, AS, DL))
return false;
Value *Return;
if (isa<FixedVectorType>(ResVTy)) {
Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
{ResVTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
} else {
unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType());
unsigned NumElts = ResVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Load->getContext(), "riscv.vector.tuple",
ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8),
Factor);
Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
{VecTupTy, PtrTy, Mask->getType(), VL->getType()});
Value *Operands[] = {
PoisonValue::get(VecTupTy),
Ptr,
Mask,
VL,
ConstantInt::get(XLenTy,
RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
ConstantInt::get(XLenTy, Log2_64(SEW))};
CallInst *Vlseg = Builder.CreateCall(VlsegNFunc, Operands);
SmallVector<Type *, 2> AggrTypes{Factor, ResVTy};
Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
for (unsigned i = 0; i < Factor; ++i) {
Value *VecExtract = Builder.CreateIntrinsic(
Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy},
{Vlseg, Builder.getInt32(i)});
Return = Builder.CreateInsertValue(Return, VecExtract, i);
}
}
DI->replaceAllUsesWith(Return);
return true;
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask, ArrayRef<Value *> InterleaveValues) const {
unsigned Factor = InterleaveValues.size();
if (Factor > 8)
return false;
IRBuilder<> Builder(Store);
auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType());
const DataLayout &DL = Store->getDataLayout();
Type *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
if (!getMemOperands(Factor, InVTy, XLenTy, Store, Ptr, Mask, VL, Alignment))
return false;
Type *PtrTy = Ptr->getType();
unsigned AS = Ptr->getType()->getPointerAddressSpace();
if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL))
return false;
if (isa<FixedVectorType>(InVTy)) {
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
Store->getModule(), FixedVssegIntrIds[Factor - 2],
{InVTy, PtrTy, XLenTy});
SmallVector<Value *, 10> Ops(InterleaveValues);
Ops.append({Ptr, Mask, VL});
Builder.CreateCall(VssegNFunc, Ops);
return true;
}
unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType());
unsigned NumElts = InVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Store->getContext(), "riscv.vector.tuple",
ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8), Factor);
Value *StoredVal = PoisonValue::get(VecTupTy);
for (unsigned i = 0; i < Factor; ++i)
StoredVal = Builder.CreateIntrinsic(
Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy},
{StoredVal, InterleaveValues[i], Builder.getInt32(i)});
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
Store->getModule(), ScalableVssegIntrIds[Factor - 2],
{VecTupTy, PtrTy, Mask->getType(), VL->getType()});
Value *Operands[] = {StoredVal, Ptr, Mask, VL,
ConstantInt::get(XLenTy, Log2_64(SEW))};
Builder.CreateCall(VssegNFunc, Operands);
return true;
}