[NVPTX] Use PRMT instruction to lower i16 bswap (#168968)
Previously, i16 `bswap` was lowered using multiple shift and OR operations. This patch adds a pattern to directly lower i16 `bswap` using the `PRMT` (permute) instruction, which is more efficient. Additionally, the lowering of `bswap` is moved into operation legalization, which allows for DAGCombiner to optimize the lowered code.
This commit is contained in:
parent
8947ba017f
commit
81e91ea1c5
@ -713,8 +713,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
|
||||
Custom);
|
||||
}
|
||||
|
||||
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
|
||||
|
||||
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
|
||||
setOperationAction(ISD::BRIND, MVT::Other, Expand);
|
||||
|
||||
@ -1106,6 +1104,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
|
||||
// * MVT::Other - internal.addrspace.wrap
|
||||
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
|
||||
{MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
|
||||
|
||||
// Custom lowering for bswap
|
||||
setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},
|
||||
Custom);
|
||||
}
|
||||
|
||||
TargetLoweringBase::LegalizeTypeAction
|
||||
@ -2570,6 +2572,44 @@ static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
|
||||
return Tcgen05StNode;
|
||||
}
|
||||
|
||||
static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) {
|
||||
SDLoc DL(Op);
|
||||
SDValue Src = Op.getOperand(0);
|
||||
EVT VT = Op.getValueType();
|
||||
|
||||
switch (VT.getSimpleVT().SimpleTy) {
|
||||
case MVT::i16: {
|
||||
SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
|
||||
SDValue Swapped =
|
||||
getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
|
||||
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
|
||||
}
|
||||
case MVT::i32: {
|
||||
return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
|
||||
}
|
||||
case MVT::v2i16: {
|
||||
SDValue Converted = DAG.getBitcast(MVT::i32, Src);
|
||||
SDValue Swapped =
|
||||
getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
|
||||
return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
|
||||
}
|
||||
case MVT::i64: {
|
||||
SDValue UnpackSrc =
|
||||
DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
|
||||
SDValue SwappedLow =
|
||||
getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123,
|
||||
DL, DAG);
|
||||
SDValue SwappedHigh =
|
||||
getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123,
|
||||
DL, DAG);
|
||||
return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
|
||||
{SwappedHigh, SwappedLow});
|
||||
}
|
||||
default:
|
||||
llvm_unreachable("unsupported type for bswap");
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
|
||||
switch (IID) {
|
||||
case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
|
||||
@ -3193,7 +3233,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
return lowerCTLZCTPOP(Op, DAG);
|
||||
case ISD::FREM:
|
||||
return lowerFREM(Op, DAG);
|
||||
|
||||
case ISD::BSWAP:
|
||||
return lowerBSWAP(Op, DAG);
|
||||
default:
|
||||
llvm_unreachable("Custom lowering not defined for operation");
|
||||
}
|
||||
|
||||
@ -2468,38 +2468,6 @@ let Predicates = [hasPTX<73>, hasSM<52>] in {
|
||||
|
||||
include "NVPTXIntrinsics.td"
|
||||
|
||||
//-----------------------------------
|
||||
// Notes
|
||||
//-----------------------------------
|
||||
// BSWAP is currently expanded. The following is a more efficient
|
||||
// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register
|
||||
// - for sm_20, use pmpt (use vector scalar mov to get the pack and
|
||||
// unpack). sm_20 supports native 32-bit register, but not native 16-bit
|
||||
// register.
|
||||
|
||||
def : Pat <
|
||||
(i32 (bswap i32:$a)),
|
||||
(PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>;
|
||||
|
||||
def : Pat <
|
||||
(v2i16 (bswap v2i16:$a)),
|
||||
(PRMT_B32rii $a, (i32 0), (i32 0x2301), PrmtNONE)>;
|
||||
|
||||
def : Pat <
|
||||
(i64 (bswap i64:$a)),
|
||||
(V2I32toI64
|
||||
(PRMT_B32rii (I64toI32H_Sink $a), (i32 0), (i32 0x0123), PrmtNONE),
|
||||
(PRMT_B32rii (I64toI32L_Sink $a), (i32 0), (i32 0x0123), PrmtNONE))>,
|
||||
Requires<[hasPTX<71>]>;
|
||||
|
||||
// Fall back to the old way if we don't have PTX 7.1.
|
||||
def : Pat <
|
||||
(i64 (bswap i64:$a)),
|
||||
(V2I32toI64
|
||||
(PRMT_B32rii (I64toI32H $a), (i32 0), (i32 0x0123), PrmtNONE),
|
||||
(PRMT_B32rii (I64toI32L $a), (i32 0), (i32 0x0123), PrmtNONE))>;
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// PTX Fence instructions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -1,25 +1,18 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
|
||||
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s
|
||||
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
|
||||
; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %}
|
||||
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s
|
||||
; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
|
||||
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
define i16 @bswap16(i16 %a) {
|
||||
; CHECK-LABEL: bswap16(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b16 %rs<5>;
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0];
|
||||
; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
|
||||
; CHECK-NEXT: shl.b16 %rs3, %rs1, 8;
|
||||
; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ld.param.b16 %r1, [bswap16_param_0];
|
||||
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7701U;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
%b = tail call i16 @llvm.bswap.i16(i16 %a)
|
||||
ret i16 %b
|
||||
@ -56,40 +49,39 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
|
||||
}
|
||||
|
||||
define i64 @bswap64(i64 %a) {
|
||||
; PTX70-LABEL: bswap64(
|
||||
; PTX70: {
|
||||
; PTX70-NEXT: .reg .b32 %r<5>;
|
||||
; PTX70-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX70-EMPTY:
|
||||
; PTX70-NEXT: // %bb.0:
|
||||
; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
|
||||
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
|
||||
; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
|
||||
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
|
||||
; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
|
||||
; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2};
|
||||
; PTX70-NEXT: st.param.b64 [func_retval0], %rd2;
|
||||
; PTX70-NEXT: ret;
|
||||
;
|
||||
; PTX71-LABEL: bswap64(
|
||||
; PTX71: {
|
||||
; PTX71-NEXT: .reg .b32 %r<5>;
|
||||
; PTX71-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX71-EMPTY:
|
||||
; PTX71-NEXT: // %bb.0:
|
||||
; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
|
||||
; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
|
||||
; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
|
||||
; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
|
||||
; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
|
||||
; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
|
||||
; PTX71-NEXT: st.param.b64 [func_retval0], %rd2;
|
||||
; PTX71-NEXT: ret;
|
||||
; CHECK-LABEL: bswap64(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
|
||||
; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
|
||||
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x123U;
|
||||
; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x123U;
|
||||
; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
|
||||
; CHECK-NEXT: ret;
|
||||
%b = tail call i64 @llvm.bswap.i64(i64 %a)
|
||||
ret i64 %b
|
||||
}
|
||||
|
||||
define <2 x i32> @bswapv2i32(<2 x i32> %a) {
|
||||
; CHECK-LABEL: bswapv2i32(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [bswapv2i32_param_0];
|
||||
; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x123U;
|
||||
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x123U;
|
||||
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
|
||||
; CHECK-NEXT: ret;
|
||||
%b = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
|
||||
ret <2 x i32> %b
|
||||
}
|
||||
declare i16 @llvm.bswap.i16(i16)
|
||||
declare i32 @llvm.bswap.i32(i32)
|
||||
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
|
||||
declare i64 @llvm.bswap.i64(i64)
|
||||
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user