[AMDGPU][CodeGen][True16] Track waitcnt as vgpr32 instead of vgpr16 for D16 Instructions in GFX11 (#157795)
It seems that a VMEM access to the hi or lo half of a VGPR can interfere with the other half. Track waitcnt on the full 32-bit VGPR instead of the 16-bit half for 16-bit registers on GFX11. --------- Co-authored-by: Joe Nash <joseph.nash@amd.com>
This commit is contained in:
parent
d6315a260b
commit
2b2b580c8d
@ -580,6 +580,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
|
||||
"Use true 16-bit registers"
|
||||
>;
|
||||
|
||||
// Subtarget feature "d16-write-vgpr32": sets EnableD16Writes32BitVgpr to "true"
// to mark targets where D16 instructions potentially have 32-bit data
// dependencies.
def FeatureD16Writes32BitVgpr : SubtargetFeature<"d16-write-vgpr32",
|
||||
"EnableD16Writes32BitVgpr",
|
||||
"true",
|
||||
"D16 instructions potentially have 32-bit data dependencies"
|
||||
>;
|
||||
|
||||
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
|
||||
"HasBF16TransInsts",
|
||||
"true",
|
||||
@ -1928,7 +1934,9 @@ def FeatureISAVersion11_Common : FeatureSet<
|
||||
FeaturePackedTID,
|
||||
FeatureVcmpxPermlaneHazard,
|
||||
FeatureMemoryAtomicFAddF32DenormalSupport,
|
||||
FeatureRealTrue16Insts]>;
|
||||
FeatureRealTrue16Insts,
|
||||
FeatureD16Writes32BitVgpr,
|
||||
]>;
|
||||
|
||||
// There are a few workarounds that need to be
|
||||
// added to all targets. This pessimizes codegen
|
||||
@ -2563,6 +2571,11 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
|
||||
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
|
||||
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
|
||||
|
||||
// Codegen predicate: Subtarget->hasD16Writes32BitVgpr() is true.
// Assembler predicate: FeatureTrue16BitInsts, FeatureRealTrue16Insts and
// FeatureD16Writes32BitVgpr must all be enabled.
def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
|
||||
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
|
||||
// Codegen predicate: Subtarget->hasD16Writes32BitVgpr() is false.
// Assembler predicate: FeatureTrue16BitInsts and FeatureRealTrue16Insts must
// be enabled while FeatureD16Writes32BitVgpr is not.
def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,
|
||||
AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not FeatureD16Writes32BitVgpr))>;
|
||||
|
||||
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
|
||||
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
|
||||
|
||||
|
||||
@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
|
||||
return hasTrue16BitInsts() && EnableRealTrue16Insts;
|
||||
}
|
||||
|
||||
// Returns the EnableD16Writes32BitVgpr subtarget flag unchanged; unlike
// useRealTrue16Insts() it is not combined with any other feature check.
bool AMDGPUSubtarget::hasD16Writes32BitVgpr() const {
|
||||
return EnableD16Writes32BitVgpr;
|
||||
}
|
||||
|
||||
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
|
||||
// allows the given function to achieve an occupancy of NWaves waves per
|
||||
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
|
||||
|
||||
@ -59,6 +59,7 @@ protected:
|
||||
bool HasCvtPkF16F32Inst = false;
|
||||
bool HasF32ToF16BF16ConversionSRInsts = false;
|
||||
bool EnableRealTrue16Insts = false;
|
||||
bool EnableD16Writes32BitVgpr = false;
|
||||
bool HasBF16TransInsts = false;
|
||||
bool HasBF16ConversionInsts = false;
|
||||
bool HasBF16PackedInsts = false;
|
||||
@ -224,6 +225,8 @@ public:
|
||||
// supported and the support for fake True16 instructions is removed.
|
||||
bool useRealTrue16Insts() const;
|
||||
|
||||
// Reports the "d16-write-vgpr32" subtarget feature (D16 instructions
// potentially have 32-bit data dependencies). Defined out of line.
bool hasD16Writes32BitVgpr() const;
|
||||
|
||||
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
|
||||
|
||||
bool hasBF16ConversionInsts() const {
|
||||
|
||||
@ -845,6 +845,15 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
|
||||
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
|
||||
assert(Size % 16 == 0);
|
||||
Result.second = Result.first + (Size / 16);
|
||||
|
||||
if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {
|
||||
// Regardless of which lo16/hi16 is used, consider the full 32-bit
|
||||
// register used.
|
||||
if (AMDGPU::isHi16Reg(MCReg, *TRI))
|
||||
Result.first -= 1;
|
||||
else
|
||||
Result.second += 1;
|
||||
}
|
||||
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
|
||||
// SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
|
||||
// sources like SRC_PRIVATE_BASE.
|
||||
|
||||
@ -15503,59 +15503,37 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
|
||||
@ -52226,59 +52204,37 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
|
||||
@ -87002,59 +86958,37 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
|
||||
@ -121707,59 +121641,37 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
|
||||
@ -147524,6 +147436,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
|
||||
@ -147555,7 +147468,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
|
||||
@ -147572,69 +147484,37 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
|
||||
@ -147648,7 +147528,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_4
|
||||
; GFX11-TRUE16-NEXT: .LBB88_2: ; %end
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
|
||||
@ -147667,7 +147546,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
|
||||
@ -147988,10 +147866,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -148008,10 +147884,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
|
||||
@ -148019,7 +147893,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
|
||||
@ -148031,10 +147904,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -148051,10 +147922,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
@ -148068,17 +147937,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -148096,10 +147962,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
@ -173957,6 +173821,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
|
||||
@ -173988,7 +173853,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
|
||||
@ -174005,69 +173869,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
|
||||
@ -174081,7 +173913,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_4
|
||||
; GFX11-TRUE16-NEXT: .LBB92_2: ; %end
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
|
||||
@ -174100,7 +173931,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
|
||||
@ -174421,10 +174251,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -174441,10 +174269,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
|
||||
@ -174452,7 +174278,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
|
||||
@ -174464,10 +174289,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -174484,10 +174307,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
@ -174501,17 +174322,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -174529,10 +174347,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
@ -196529,6 +196345,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
|
||||
@ -196560,7 +196377,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
|
||||
@ -196577,69 +196393,37 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
|
||||
@ -196653,7 +196437,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_4
|
||||
; GFX11-TRUE16-NEXT: .LBB96_2: ; %end
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
|
||||
@ -196672,7 +196455,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
|
||||
@ -196993,10 +196775,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -197013,10 +196793,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
|
||||
@ -197024,7 +196802,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
|
||||
@ -197036,10 +196813,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -197056,10 +196831,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
@ -197073,17 +196846,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
@ -197101,10 +196871,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
|
||||
@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
|
||||
@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
|
||||
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
|
||||
@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
|
||||
@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
|
||||
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
|
||||
@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
|
||||
@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
|
||||
@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
|
||||
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
|
||||
@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
|
||||
@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
|
||||
@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
|
||||
; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
|
||||
@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
|
||||
@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
|
||||
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
|
||||
@ -32871,6 +32851,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
|
||||
@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
|
||||
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
|
||||
|
||||
@ -12492,6 +12492,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
|
||||
@ -12523,39 +12524,22 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
|
||||
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
|
||||
@ -27377,6 +27361,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
|
||||
@ -27408,39 +27393,22 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
|
||||
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
|
||||
@ -41534,6 +41502,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
|
||||
@ -41565,39 +41534,22 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
|
||||
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
|
||||
@ -54837,6 +54789,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
|
||||
@ -54868,39 +54821,22 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
|
||||
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
|
||||
@ -68501,6 +68437,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
|
||||
@ -68533,37 +68470,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
|
||||
@ -68710,6 +68634,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
|
||||
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
|
||||
@ -68717,7 +68642,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
@ -68732,11 +68656,10 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
@ -68756,7 +68679,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
|
||||
@ -80726,6 +80648,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
|
||||
@ -80758,37 +80681,24 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
|
||||
@ -80935,6 +80845,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
|
||||
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
|
||||
@ -80942,7 +80853,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
@ -80957,11 +80867,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
@ -80981,7 +80890,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
|
||||
@ -91233,6 +91141,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
|
||||
@ -91265,37 +91174,24 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
|
||||
@ -91442,6 +91338,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
|
||||
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
|
||||
@ -91449,7 +91346,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
@ -91464,11 +91360,10 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
@ -91488,7 +91383,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
|
||||
|
||||
@ -23,9 +23,9 @@ define amdgpu_kernel void @long_forward_branch_gfx11plus(ptr addrspace(1) %in, p
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: global_load_d16_b16 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3] offset:2
|
||||
; GFX11-NEXT: .LBB0_2: ; %bb3
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
||||
@ -1110,6 +1110,7 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x7b
|
||||
; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v2, v0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2
|
||||
; GFX11-TRUE16-NEXT: ds_load_u16_d16 v2, v0 offset:2
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
||||
@ -1561,8 +1561,8 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp
|
||||
|
||||
@ -1685,19 +1685,18 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v5
|
||||
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
|
||||
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0
|
||||
; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5]
|
||||
; GFX11-DL-TRUE16-NEXT: s_endpgm
|
||||
@ -1977,13 +1976,12 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
|
||||
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v4, 0, 8
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
|
||||
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.h
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
|
||||
@ -2726,10 +2724,10 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
|
||||
|
||||
@ -376,9 +376,8 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48
|
||||
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v0.h, v0.l
|
||||
|
||||
@ -461,9 +461,8 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48
|
||||
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l
|
||||
|
||||
@ -1528,8 +1528,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64
|
||||
; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
|
||||
@ -1559,8 +1559,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l
|
||||
; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
|
||||
|
||||
@ -61,8 +61,8 @@ define void @spill_i16_alu_two_vals() {
|
||||
; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
|
||||
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
|
||||
; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
|
||||
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
|
||||
; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc
|
||||
; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user