AMDGPU: Directly use v2bf16 as register type for bf16 vectors. (#175761)
Previously we were casting v2bf16 to i32, unlike the f16 case. Simplify this by using the natural vector type. This is probably a leftover from before v2bf16 was treated as legal. This is preparation for fixing a miscompile in GlobalISel.
This commit is contained in:
parent
5cb4d32e3d
commit
2e0e4f6cb3
@ -1111,11 +1111,8 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
|
||||
EVT ScalarVT = VT.getScalarType();
|
||||
unsigned Size = ScalarVT.getSizeInBits();
|
||||
if (Size == 16) {
|
||||
if (Subtarget->has16BitInsts()) {
|
||||
if (VT.isInteger())
|
||||
return MVT::v2i16;
|
||||
return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
|
||||
}
|
||||
if (Subtarget->has16BitInsts())
|
||||
return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
|
||||
return VT.isInteger() ? MVT::i32 : MVT::f32;
|
||||
}
|
||||
|
||||
@ -1167,13 +1164,8 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
|
||||
// support, but unless we can properly handle 3-vectors, it will still be
|
||||
// inconsistent.
|
||||
if (Size == 16 && Subtarget->has16BitInsts()) {
|
||||
if (ScalarVT == MVT::bf16) {
|
||||
RegisterVT = MVT::i32;
|
||||
IntermediateVT = MVT::v2bf16;
|
||||
} else {
|
||||
RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
|
||||
IntermediateVT = RegisterVT;
|
||||
}
|
||||
RegisterVT = MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
|
||||
IntermediateVT = RegisterVT;
|
||||
NumIntermediates = (NumElts + 1) / 2;
|
||||
return NumIntermediates;
|
||||
}
|
||||
|
||||
@ -8,24 +8,20 @@ define <3 x bfloat> @v3bf16(<3 x bfloat> %arg0) {
|
||||
; GFX9: bb.1 (%ir-block.0):
|
||||
; GFX9-NEXT: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
|
||||
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<3 x s16>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16)
|
||||
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
|
||||
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2)
|
||||
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<3 x s16>)
|
||||
; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT5]](s32)
|
||||
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
|
||||
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF]](s16)
|
||||
; GFX9-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<4 x s16>)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[UV7]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[UV8]](<2 x s16>)
|
||||
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
|
||||
%res = shufflevector <3 x bfloat> %arg0, <3 x bfloat> zeroinitializer, <3 x i32> <i32 3, i32 1, i32 2>
|
||||
ret <3 x bfloat> %res
|
||||
@ -36,24 +32,15 @@ define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
|
||||
; GFX9: bb.1 (%ir-block.0):
|
||||
; GFX9-NEXT: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
|
||||
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32)
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
|
||||
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
|
||||
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<4 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0)
|
||||
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>)
|
||||
; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT5]](s32)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<4 x s16>), [[BUILD_VECTOR]], shufflemask(3, 1, 2, 0)
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
|
||||
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
|
||||
%res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
|
||||
ret <4 x bfloat> %res
|
||||
@ -64,30 +51,22 @@ define <5 x bfloat> @v5bf16(<5 x bfloat> %arg0) {
|
||||
; GFX9: bb.1 (%ir-block.0):
|
||||
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
|
||||
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
|
||||
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32)
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<5 x s16>) = G_TRUNC [[BUILD_VECTOR]](<5 x s32>)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>)
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16)
|
||||
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
|
||||
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4)
|
||||
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<5 x s16>)
|
||||
; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT6]](s32)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT7]](s32)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT8]](s32)
|
||||
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
|
||||
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[DEF]](s16)
|
||||
; GFX9-NEXT: [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<6 x s16>)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[UV11]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[UV12]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[UV13]](<2 x s16>)
|
||||
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
|
||||
%res = shufflevector <5 x bfloat> %arg0, <5 x bfloat> zeroinitializer, <5 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4>
|
||||
ret <5 x bfloat> %res
|
||||
@ -98,30 +77,17 @@ define <6 x bfloat> @v6bf16(<6 x bfloat> %arg0) {
|
||||
; GFX9: bb.1 (%ir-block.0):
|
||||
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
|
||||
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
|
||||
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32)
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<6 x s16>) = G_TRUNC [[BUILD_VECTOR]](<6 x s32>)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>)
|
||||
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
|
||||
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<6 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5)
|
||||
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>)
|
||||
; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT6]](s32)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT7]](s32)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT8]](s32)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<6 x s16>), [[BUILD_VECTOR]], shufflemask(3, 1, 2, 0, 4, 5)
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>)
|
||||
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
|
||||
%res = shufflevector <6 x bfloat> %arg0, <6 x bfloat> zeroinitializer, <6 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5>
|
||||
ret <6 x bfloat> %res
|
||||
@ -132,36 +98,24 @@ define <7 x bfloat> @v7bf16(<7 x bfloat> %arg0) {
|
||||
; GFX9: bb.1 (%ir-block.0):
|
||||
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
|
||||
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
|
||||
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
|
||||
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32)
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<7 x s16>) = G_TRUNC [[BUILD_VECTOR]](<7 x s32>)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
|
||||
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s16>)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16)
|
||||
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
|
||||
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6)
|
||||
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<7 x s16>)
|
||||
; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT8]](s32)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT9]](s32)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT10]](s32)
|
||||
; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT11]](s32)
|
||||
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
|
||||
; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[DEF]](s16)
|
||||
; GFX9-NEXT: [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s16>)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[UV15]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[UV16]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[UV17]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr3 = COPY [[UV18]](<2 x s16>)
|
||||
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
%res = shufflevector <7 x bfloat> %arg0, <7 x bfloat> zeroinitializer, <7 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6>
|
||||
ret <7 x bfloat> %res
|
||||
@ -172,36 +126,19 @@ define <8 x bfloat> @v8bf16(<8 x bfloat> %arg0) {
|
||||
; GFX9: bb.1 (%ir-block.0):
|
||||
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
|
||||
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
|
||||
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
|
||||
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32)
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s16>) = G_TRUNC [[BUILD_VECTOR]](<8 x s32>)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
|
||||
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
|
||||
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
|
||||
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<8 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6, 7)
|
||||
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<8 x s16>)
|
||||
; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT8]](s32)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT9]](s32)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT10]](s32)
|
||||
; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT11]](s32)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
|
||||
; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s16>), [[BUILD_VECTOR]], shufflemask(3, 1, 2, 0, 4, 5, 6, 7)
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[SHUF]](<8 x s16>)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](<2 x s16>)
|
||||
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
|
||||
%res = shufflevector <8 x bfloat> %arg0, <8 x bfloat> zeroinitializer, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x bfloat> %res
|
||||
@ -212,57 +149,24 @@ define <16 x bfloat> @v16bf16(<16 x bfloat> %arg0) {
|
||||
; GFX9: bb.1 (%ir-block.0):
|
||||
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
|
||||
; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
|
||||
; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
|
||||
; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
|
||||
; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
|
||||
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
|
||||
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
|
||||
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
|
||||
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY4]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
|
||||
; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY5]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
|
||||
; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY6]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s16)
|
||||
; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY7]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32)
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<16 x s16>) = G_TRUNC [[BUILD_VECTOR]](<16 x s32>)
|
||||
; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<16 x s16>)
|
||||
; GFX9-NEXT: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT16]](s32)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT17]](s32)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT18]](s32)
|
||||
; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT19]](s32)
|
||||
; GFX9-NEXT: $vgpr4 = COPY [[ANYEXT20]](s32)
|
||||
; GFX9-NEXT: $vgpr5 = COPY [[ANYEXT21]](s32)
|
||||
; GFX9-NEXT: $vgpr6 = COPY [[ANYEXT22]](s32)
|
||||
; GFX9-NEXT: $vgpr7 = COPY [[ANYEXT23]](s32)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
|
||||
; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
|
||||
; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
|
||||
; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6
|
||||
; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7
|
||||
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>)
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr4 = COPY [[UV4]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr5 = COPY [[UV5]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr6 = COPY [[UV6]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr7 = COPY [[UV7]](<2 x s16>)
|
||||
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
|
||||
ret <16 x bfloat> %arg0
|
||||
}
|
||||
@ -272,105 +176,40 @@ define <32 x bfloat> @v32bf16(<32 x bfloat> %arg0) {
|
||||
; GFX9: bb.1 (%ir-block.0):
|
||||
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
|
||||
; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
|
||||
; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
|
||||
; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
|
||||
; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
|
||||
; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
|
||||
; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
|
||||
; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
|
||||
; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
|
||||
; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
|
||||
; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
|
||||
; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
|
||||
; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
|
||||
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
|
||||
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
|
||||
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
|
||||
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY4]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
|
||||
; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY5]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
|
||||
; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY6]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s16)
|
||||
; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY7]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16)
|
||||
; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY8]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16)
|
||||
; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY9]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16)
|
||||
; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY10]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16)
|
||||
; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY11]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16)
|
||||
; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY12]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[UV24]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[UV25]](s16)
|
||||
; GFX9-NEXT: [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY13]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[UV26]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[UV27]](s16)
|
||||
; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY14]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[UV28]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[UV29]](s16)
|
||||
; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY15]](s32)
|
||||
; GFX9-NEXT: [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[UV30]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[UV31]](s16)
|
||||
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32), [[ANYEXT16]](s32), [[ANYEXT17]](s32), [[ANYEXT18]](s32), [[ANYEXT19]](s32), [[ANYEXT20]](s32), [[ANYEXT21]](s32), [[ANYEXT22]](s32), [[ANYEXT23]](s32), [[ANYEXT24]](s32), [[ANYEXT25]](s32), [[ANYEXT26]](s32), [[ANYEXT27]](s32), [[ANYEXT28]](s32), [[ANYEXT29]](s32), [[ANYEXT30]](s32), [[ANYEXT31]](s32)
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<32 x s16>) = G_TRUNC [[BUILD_VECTOR]](<32 x s32>)
|
||||
; GFX9-NEXT: [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16), [[UV63:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<32 x s16>)
|
||||
; GFX9-NEXT: [[ANYEXT32:%[0-9]+]]:_(s32) = G_ANYEXT [[UV32]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT33:%[0-9]+]]:_(s32) = G_ANYEXT [[UV33]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT34:%[0-9]+]]:_(s32) = G_ANYEXT [[UV34]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT35:%[0-9]+]]:_(s32) = G_ANYEXT [[UV35]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT36:%[0-9]+]]:_(s32) = G_ANYEXT [[UV36]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT37:%[0-9]+]]:_(s32) = G_ANYEXT [[UV37]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT38:%[0-9]+]]:_(s32) = G_ANYEXT [[UV38]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT39:%[0-9]+]]:_(s32) = G_ANYEXT [[UV39]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT40:%[0-9]+]]:_(s32) = G_ANYEXT [[UV40]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT41:%[0-9]+]]:_(s32) = G_ANYEXT [[UV41]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT42:%[0-9]+]]:_(s32) = G_ANYEXT [[UV42]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT43:%[0-9]+]]:_(s32) = G_ANYEXT [[UV43]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT44:%[0-9]+]]:_(s32) = G_ANYEXT [[UV44]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT45:%[0-9]+]]:_(s32) = G_ANYEXT [[UV45]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT46:%[0-9]+]]:_(s32) = G_ANYEXT [[UV46]](s16)
|
||||
; GFX9-NEXT: [[ANYEXT47:%[0-9]+]]:_(s32) = G_ANYEXT [[UV47]](s16)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT32]](s32)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT33]](s32)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT34]](s32)
|
||||
; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT35]](s32)
|
||||
; GFX9-NEXT: $vgpr4 = COPY [[ANYEXT36]](s32)
|
||||
; GFX9-NEXT: $vgpr5 = COPY [[ANYEXT37]](s32)
|
||||
; GFX9-NEXT: $vgpr6 = COPY [[ANYEXT38]](s32)
|
||||
; GFX9-NEXT: $vgpr7 = COPY [[ANYEXT39]](s32)
|
||||
; GFX9-NEXT: $vgpr8 = COPY [[ANYEXT40]](s32)
|
||||
; GFX9-NEXT: $vgpr9 = COPY [[ANYEXT41]](s32)
|
||||
; GFX9-NEXT: $vgpr10 = COPY [[ANYEXT42]](s32)
|
||||
; GFX9-NEXT: $vgpr11 = COPY [[ANYEXT43]](s32)
|
||||
; GFX9-NEXT: $vgpr12 = COPY [[ANYEXT44]](s32)
|
||||
; GFX9-NEXT: $vgpr13 = COPY [[ANYEXT45]](s32)
|
||||
; GFX9-NEXT: $vgpr14 = COPY [[ANYEXT46]](s32)
|
||||
; GFX9-NEXT: $vgpr15 = COPY [[ANYEXT47]](s32)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
|
||||
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
|
||||
; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
|
||||
; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
|
||||
; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6
|
||||
; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7
|
||||
; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr8
|
||||
; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr9
|
||||
; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr10
|
||||
; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr11
|
||||
; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr12
|
||||
; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr13
|
||||
; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr14
|
||||
; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr15
|
||||
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[COPY12]](<2 x s16>), [[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>)
|
||||
; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>)
|
||||
; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr4 = COPY [[UV4]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr5 = COPY [[UV5]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr6 = COPY [[UV6]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr7 = COPY [[UV7]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr8 = COPY [[UV8]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr9 = COPY [[UV9]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr10 = COPY [[UV10]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr11 = COPY [[UV11]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr12 = COPY [[UV12]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr13 = COPY [[UV13]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr14 = COPY [[UV14]](<2 x s16>)
|
||||
; GFX9-NEXT: $vgpr15 = COPY [[UV15]](<2 x s16>)
|
||||
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
|
||||
ret <32 x bfloat> %arg0
|
||||
}
|
||||
|
||||
@ -3021,10 +3021,9 @@ define void @void_func_v2bf16_inreg(<2 x bfloat> inreg %arg0) #0 {
|
||||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK-NEXT: liveins: $sgpr16
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
|
||||
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY]](s32)
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr16
|
||||
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
|
||||
; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s16>), [[DEF]](p1) :: (store (<2 x s16>) into `ptr addrspace(1) poison`, addrspace 1)
|
||||
; CHECK-NEXT: G_STORE [[COPY]](<2 x s16>), [[DEF]](p1) :: (store (<2 x s16>) into `ptr addrspace(1) poison`, addrspace 1)
|
||||
; CHECK-NEXT: SI_RETURN
|
||||
store <2 x bfloat> %arg0, ptr addrspace(1) poison
|
||||
ret void
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -2403,7 +2403,6 @@ define amdgpu_vs <2 x bfloat> @load_v2bf16(ptr addrspace(6) inreg %p0, ptr addrs
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: load_v2bf16:
|
||||
@ -2438,7 +2437,6 @@ define amdgpu_vs <2 x bfloat> @load_v2bf16(ptr addrspace(6) inreg %p0, ptr addrs
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX9-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
%gep1 = getelementptr inbounds <2 x bfloat>, ptr addrspace(6) %p1, i32 2
|
||||
%r0 = load <2 x bfloat>, ptr addrspace(6) %p0
|
||||
|
||||
@ -515,36 +515,37 @@ define <2 x bfloat> @v_exp2_fabs_v2bf16(<2 x bfloat> %in) {
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_lshlrev_b32 v1, 16, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(TRANS32_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v2, v1, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
||||
@ -561,68 +562,66 @@ define <2 x bfloat> @v_exp2_fabs_v2bf16(<2 x bfloat> %in) {
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0x7fff, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_lshlrev_b32 v1, 16, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fabs_v2bf16:
|
||||
; GFX1250-SDAG-TRUE16: ; %bb.0:
|
||||
; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15
|
||||
; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v1.l
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.h, v2.l
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, |v0.h|
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, |v0.l|
|
||||
; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
;
|
||||
; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fabs_v2bf16:
|
||||
; GFX1250-SDAG-FAKE16: ; %bb.0:
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v1, v1
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, |v0|
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, |v1|
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_nop
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
|
||||
%result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %fabs)
|
||||
@ -637,47 +636,44 @@ define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) {
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 15
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, 0x8000, v1.l
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v1.h, 0x8000, v0.l
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, 0x8000, v0.l
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v2, v1, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v1.h, 0x8000, v0.h
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v2, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16:
|
||||
@ -687,73 +683,66 @@ define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) {
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(TRANS32_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v2
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
||||
; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_v2bf16:
|
||||
; GFX1250-SDAG-TRUE16: ; %bb.0:
|
||||
; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15
|
||||
; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -v1.l
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, -v2.l
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, -|v0.h|
|
||||
; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -|v0.l|
|
||||
; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
;
|
||||
; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16:
|
||||
; GFX1250-SDAG-FAKE16: ; %bb.0:
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, -v1
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -v0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -|v0|
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, -|v1|
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_nop
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
|
||||
; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
|
||||
%fneg.fabs = fneg <2 x bfloat> %fabs
|
||||
|
||||
@ -127,25 +127,20 @@ define <2 x bfloat> @v_log2_fabs_v2bf16(<2 x bfloat> %in) {
|
||||
; GFX-SDAG-TRUE16: ; %bb.0:
|
||||
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15
|
||||
; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v1.l
|
||||
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v2.l
|
||||
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, |v0.h|
|
||||
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, |v0.l|
|
||||
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
;
|
||||
; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_v2bf16:
|
||||
; GFX-SDAG-FAKE16: ; %bb.0:
|
||||
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
|
||||
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1
|
||||
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
|
||||
; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, |v0|
|
||||
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
|
||||
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, |v1|
|
||||
; GFX-SDAG-FAKE16-NEXT: v_nop
|
||||
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
|
||||
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
|
||||
%result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fabs)
|
||||
@ -157,25 +152,20 @@ define <2 x bfloat> @v_log2_fneg_fabs_v2bf16(<2 x bfloat> %in) {
|
||||
; GFX-SDAG-TRUE16: ; %bb.0:
|
||||
; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15
|
||||
; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v1.l
|
||||
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v2.l
|
||||
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -|v0.h|
|
||||
; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -|v0.l|
|
||||
; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
;
|
||||
; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_v2bf16:
|
||||
; GFX-SDAG-FAKE16: ; %bb.0:
|
||||
; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
|
||||
; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
|
||||
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1
|
||||
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0
|
||||
; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -|v0|
|
||||
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
|
||||
; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -|v1|
|
||||
; GFX-SDAG-FAKE16-NEXT: v_nop
|
||||
; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
|
||||
; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
|
||||
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
|
||||
%fneg.fabs = fneg <2 x bfloat> %fabs
|
||||
|
||||
@ -10098,21 +10098,20 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v55, off, s32
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v37, v12
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v50, v8
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v9 :: v_dual_and_b32 v8, 0xffff0000, v53
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v31, v10
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v33, v8
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_and_b32 v8, 0xffff0000, v53
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v11 :: v_dual_mov_b32 v37, v12
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v36, v9
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v51
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v23
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v23
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v22
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v48
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v19
|
||||
@ -10180,7 +10179,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v34.h, v27.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v26
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v8
|
||||
@ -10194,7 +10193,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v31.h, v26.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v25
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v50
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v33
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v8
|
||||
@ -10205,7 +10204,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v8.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v39.h, v25.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v36.h, v25.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.h, v54.h, vcc_lo
|
||||
@ -10217,7 +10216,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v8.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v50.h, v24.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v33.h, v24.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v54.h, vcc_lo
|
||||
@ -10229,39 +10228,41 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v7.h, v23.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v32.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v32.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v32.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v32.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
@ -10421,7 +10422,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v54.h, v28.h, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v26.l, v27.h, s2
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v39
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v36
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v27.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v54
|
||||
@ -10431,12 +10432,12 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v36.l, v25.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v54.h, v27.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v25.l, v26.h, s2
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v50
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v33
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v24
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v26.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
@ -10447,7 +10448,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v33.l, v24.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v54.h, v26.h, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
@ -10480,7 +10481,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v54.h, v7.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v54.h, v7.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v22.l, v6.h, s2
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v21
|
||||
@ -10496,13 +10497,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v54.h, v6.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v54.h, v6.h, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.l, v5.h, s2
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v33
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v50
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v54
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
|
||||
@ -10518,7 +10519,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v36
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v39
|
||||
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v54
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
|
||||
@ -11168,21 +11169,20 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: scratch_load_b32 v55, off, s32
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v50, v8
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_and_b32 v8, 0xffff0000, v53
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v39, v9
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v31, v10
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v33, v8
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_and_b32 v8, 0xffff0000, v53
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v34, v11 :: v_dual_mov_b32 v37, v12
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v36, v9
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v51
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v23
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v23
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v22
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v48
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v19
|
||||
@ -11260,7 +11260,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v34.h, v27.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v26
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
@ -11276,7 +11276,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v31.h, v26.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v25
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v50
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v33
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
@ -11289,7 +11289,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v8.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v39.h, v25.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v36.h, v25.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
@ -11304,7 +11304,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v8.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v50.h, v24.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v33.h, v24.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
@ -11318,32 +11318,16 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v7.h, v23.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v6
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v32.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
@ -11351,20 +11335,36 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v32.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v32.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v32.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
@ -11548,7 +11548,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v54.h, v28.h, s0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v26.l, v27.h, s2
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v39
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v36
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v27.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v54
|
||||
@ -11560,13 +11560,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v36.l, v25.l, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v54.h, v27.h, s0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v25.l, v26.h, s2
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v50
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v33
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v24
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v26.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v54
|
||||
@ -11578,7 +11578,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v33.l, v24.l, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v54.h, v26.h, s0
|
||||
@ -11616,7 +11616,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v54.h, v7.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v54.h, v7.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v22.l, v6.h, s2
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v21
|
||||
@ -11635,12 +11635,12 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v54.h, v6.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v54.h, v6.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.l, v5.h, s2
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v33
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v50
|
||||
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v54
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo
|
||||
@ -11658,7 +11658,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v36
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v39
|
||||
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v54
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo
|
||||
|
||||
@ -10131,21 +10131,20 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v55, off, s32
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v37, v12
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v50, v8
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v9 :: v_dual_and_b32 v8, 0xffff0000, v53
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v31, v10
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v33, v8
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_and_b32 v8, 0xffff0000, v53
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v11 :: v_dual_mov_b32 v37, v12
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v36, v9
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v51
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v23
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v23
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v22
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v48
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v19
|
||||
@ -10213,7 +10212,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v34.h, v27.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v26
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v8
|
||||
@ -10227,7 +10226,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v31.h, v26.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v25
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v50
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v33
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v8
|
||||
@ -10238,7 +10237,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v8.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v39.h, v25.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v36.h, v25.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.h, v54.h, vcc_lo
|
||||
@ -10250,7 +10249,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v8.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v50.h, v24.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v33.h, v24.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v54.h, vcc_lo
|
||||
@ -10262,39 +10261,41 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v7.h, v23.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v32.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v32.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v32.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v32.h, v54.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
@ -10454,7 +10455,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v54.h, v28.h, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v26.l, v27.h, s2
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v39
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v36
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v27.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v54
|
||||
@ -10464,12 +10465,12 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v36.l, v25.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v54.h, v27.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v25.l, v26.h, s2
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v50
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v33
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v24
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v26.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
@ -10480,7 +10481,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v33.l, v24.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v54.h, v26.h, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
@ -10513,7 +10514,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v54.h, v7.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v54.h, v7.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v22.l, v6.h, s2
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v21
|
||||
@ -10529,13 +10530,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v54.l
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v54.h, v6.h, s0
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v54.h, v6.h, s0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.l, v5.h, s2
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v33
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v50
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v54
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
|
||||
@ -10551,7 +10552,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
|
||||
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v36
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v39
|
||||
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v54
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo
|
||||
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
|
||||
@ -11201,21 +11202,20 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: scratch_load_b32 v55, off, s32
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v50, v8
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_and_b32 v8, 0xffff0000, v53
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v39, v9
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v31, v10
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v33, v8
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_and_b32 v8, 0xffff0000, v53
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v34, v11 :: v_dual_mov_b32 v37, v12
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v36, v9
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v51
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v23
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v23
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v22
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v48
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v19
|
||||
@ -11293,7 +11293,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v34.h, v27.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v26
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
@ -11309,7 +11309,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v31.h, v26.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v25
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v50
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v33
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
@ -11322,7 +11322,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v8.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v39.h, v25.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v36.h, v25.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
@ -11337,7 +11337,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v8.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v50.h, v24.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v33.h, v24.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
@ -11351,32 +11351,16 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v7.h, v23.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v6
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v32.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
@ -11384,20 +11368,36 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v32.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v32.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v32.h, v54.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
|
||||
@ -11581,7 +11581,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v54.h, v28.h, s0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v26.l, v27.h, s2
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v39
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v36
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v27.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v54
|
||||
@ -11593,13 +11593,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v36.l, v25.l, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v54.h, v27.h, s0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v25.l, v26.h, s2
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v50
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v33
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v24
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v26.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v54
|
||||
@ -11611,7 +11611,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v33.l, v24.l, vcc_lo
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v54.h, v26.h, s0
|
||||
@ -11649,7 +11649,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v54.h, v7.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v54.h, v7.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v22.l, v6.h, s2
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v21
|
||||
@ -11668,12 +11668,12 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v54.l
|
||||
; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v54.h, v6.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v54.h, v6.h, s0
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.l, v5.h, s2
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v33
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v50
|
||||
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v54
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo
|
||||
@ -11691,7 +11691,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
|
||||
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
|
||||
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v36
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v39
|
||||
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v54
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
||||
; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user