[X86][AMX] Support AMX-TRANSPOSE (#113532)

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

parent 1e19f0f9d9
commit c72a751dab
@ -676,6 +676,7 @@ X86 Support

- Supported intrinsics for ``MOVRS AND AVX10.2``.
  * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
- Support ISA of ``AMX-FP8``.
- Support ISA of ``AMX-TRANSPOSE``.

Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
@ -128,6 +128,11 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@ -148,6 +153,12 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")

TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")

TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd")
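For readers unfamiliar with clang's builtin prototype strings, the encodings above decode roughly as follows. This is a sketch based on how amxtransposeintrin.h (added below) calls these builtins, not an authoritative reference for the encoding:

  /* "vIUcvC*z": void (constant unsigned char, const void *, size_t) */
  void __builtin_ia32_t2rpntlvwz0(unsigned char tdst, const void *base,
                                  __SIZE_TYPE__ stride);
  /* "vUsUsUsV256i*V256i*vC*z": three unsigned short shape operands, two
     pointers to 256 x i32 vectors, a source pointer, and a stride */
  void __builtin_ia32_t2rpntlvwz0_internal(unsigned short row,
                                           unsigned short col0,
                                           unsigned short col1,
                                           _tile1024i_1024a *dst0,
                                           _tile1024i_1024a *dst1,
                                           const void *base,
                                           __SIZE_TYPE__ stride);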
@ -6301,6 +6301,8 @@ def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group<m_x86_Features_Group>;
def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group<m_x86_Features_Group>;
def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
      HasAMXCOMPLEX = true;
    } else if (Feature == "+amx-fp8") {
      HasAMXFP8 = true;
    } else if (Feature == "+amx-transpose") {
      HasAMXTRANSPOSE = true;
    } else if (Feature == "+cmpccxadd") {
      HasCMPCCXADD = true;
    } else if (Feature == "+raoint") {
@ -951,6 +953,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
    Builder.defineMacro("__AMX_COMPLEX__");
  if (HasAMXFP8)
    Builder.defineMacro("__AMX_FP8__");
  if (HasAMXTRANSPOSE)
    Builder.defineMacro("__AMX_TRANSPOSE__");
  if (HasCMPCCXADD)
    Builder.defineMacro("__CMPCCXADD__");
  if (HasRAOINT)
@ -1079,9 +1083,10 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
      .Case("amx-bf16", true)
      .Case("amx-complex", true)
      .Case("amx-fp16", true)
      .Case("amx-fp8", true)
      .Case("amx-int8", true)
      .Case("amx-tile", true)
      .Case("amx-fp8", true)
      .Case("amx-transpose", true)
      .Case("avx", true)
      .Case("avx10.1-256", true)
      .Case("avx10.1-512", true)
@ -1198,9 +1203,10 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
      .Case("amx-bf16", HasAMXBF16)
      .Case("amx-complex", HasAMXCOMPLEX)
      .Case("amx-fp16", HasAMXFP16)
      .Case("amx-fp8", HasAMXFP8)
      .Case("amx-int8", HasAMXINT8)
      .Case("amx-tile", HasAMXTILE)
      .Case("amx-fp8", HasAMXFP8)
      .Case("amx-transpose", HasAMXTRANSPOSE)
      .Case("avx", SSELevel >= AVX)
      .Case("avx10.1-256", HasAVX10_1)
      .Case("avx10.1-512", HasAVX10_1_512)
@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
  bool HasAMXBF16 = false;
  bool HasAMXCOMPLEX = false;
  bool HasAMXFP8 = false;
  bool HasAMXTRANSPOSE = false;
  bool HasSERIALIZE = false;
  bool HasTSXLDTRK = false;
  bool HasUSERMSR = false;
@ -16994,6 +16994,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
    // instruction, but it will create a memset that won't be optimized away.
    return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
  }
  // Corresponding to intrinsics which will return 2 tiles (tile0_tile1).
  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
    Intrinsic::ID IID;
    switch (BuiltinID) {
    default:
      llvm_unreachable("Unsupported intrinsic!");
    case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
      IID = Intrinsic::x86_t2rpntlvwz0_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
      IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
      IID = Intrinsic::x86_t2rpntlvwz1_internal;
      break;
    case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
      IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
      break;
    }

    // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
                                     {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});

    auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
    assert(PtrTy && "arg3 must be of pointer type");
    QualType PtreeTy = PtrTy->getPointeeType();
    llvm::Type *TyPtee = ConvertType(PtreeTy);

    // Bitcast amx type (x86_amx) to vector type (256 x i32),
    // then store tile0 into DstPtr0.
    Value *T0 = Builder.CreateExtractValue(Call, 0);
    Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
                                           {TyPtee}, {T0});
    Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);

    // Then store tile1 into DstPtr1.
    Value *T1 = Builder.CreateExtractValue(Call, 1);
    Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
                                           {TyPtee}, {T1});
    Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);

    // Note: We deliberately avoid storing the results with
    // x86_tilestored64_internal here, because it cannot bound the scope of
    // the memory write. That could cause shape reloads after the first AMX
    // intrinsic, which the current AMX register allocation cannot handle.

    return Store;
  }
  case X86::BI__ud2:
    // llvm.trap makes a ud2a instruction on x86.
    return EmitTrapCall(Intrinsic::trap);
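For orientation, here is a hedged sketch of user-level C code that reaches the lowering path above. The shape values, stride, and buffer are illustrative only; the casts mirror the wrappers added in amxtransposeintrin.h below:

  #include <immintrin.h>

  void demo_pair_load(const void *base) {
    _tile1024i t0, t1;
    /* Emits llvm.x86.t2rpntlvwz0.internal, then the two
       cast-tile-to-vector + store sequences built above. */
    __builtin_ia32_t2rpntlvwz0_internal(16, 64, 64, (_tile1024i_1024a *)&t0,
                                        (_tile1024i_1024a *)&t1, base, 64);
  }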
@ -148,8 +148,9 @@ set(x86_files
  ammintrin.h
  amxcomplexintrin.h
  amxfp16intrin.h
  amxintrin.h
  amxfp8intrin.h
  amxintrin.h
  amxtransposeintrin.h
  avx10_2_512bf16intrin.h
  avx10_2_512convertintrin.h
  avx10_2_512minmaxintrin.h
@ -232,6 +232,8 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// bytes. Since there is no 2D type in llvm IR, we use vector type to
/// represent 2D tile and the fixed size is maximum amx tile register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
typedef int _tile1024i_1024a
    __attribute__((__vector_size__(1024), __aligned__(1024)));

/// This is internal intrinsic. C/C++ user should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
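The two typedefs above differ only in their guaranteed alignment; the following compile-time checks make that explicit (a sketch that follows directly from the attributes, not part of the header):

  _Static_assert(sizeof(_tile1024i) == 1024, "1 KiB tile payload");
  _Static_assert(_Alignof(_tile1024i) == 64, "default 64-byte alignment");
  _Static_assert(_Alignof(_tile1024i_1024a) == 1024, "over-aligned variant");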
clang/lib/Headers/amxtransposeintrin.h (new file, 248 lines)
@ -0,0 +1,248 @@
/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_TRANSPOSEINTRIN_H
#define __AMX_TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))

#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
/// Transpose 32-bit elements from \a src and write the result to \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_transposed(__tile dst, __tile src);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
///
/// FOR i := 0 TO (dst.rows-1)
///   tmp[511:0] := 0
///   FOR j := 0 TO (dst.colsb/4-1)
///     tmp.dword[j] := src.row[j].dword[i]
///   ENDFOR
///   dst.row[i] := tmp
/// ENDFOR
///
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
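As a concrete reading of the \code{.operation} block above, the following plain-C sketch models the dword transpose on an ordinary 2-D array. The array bounds and names are illustrative, not part of the header:

  #include <stdint.h>

  /* Scalar model of TTRANSPOSED: dst.row[i].dword[j] = src.row[j].dword[i].
     colsb is the destination row width in bytes, so colsb/4 dwords per row. */
  static void ttransposed_model(uint32_t dst[16][16],
                                const uint32_t src[16][16],
                                int dst_rows, int dst_colsb) {
    for (int i = 0; i < dst_rows; ++i)
      for (int j = 0; j < dst_colsb / 4; ++j)
        dst[i][j] = src[j][i];
  }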
static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  // Use _tile1024i_1024a* to bypass the alignment check in
  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
                                      (_tile1024i_1024a *)dst1, base,
                                      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz0t1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
                                      (_tile1024i_1024a *)dst1, base,
                                      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1t1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
  return __builtin_ia32_ttransposed_internal(m, n, src);
}
/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
                              const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                            &dst1->tile, base, stride);
}
/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
                              const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                            &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Transpose 32-bit elements from src and write the result to dst.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
  dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
}

#endif /* __x86_64__ */
#endif /* __AMX_TRANSPOSEINTRIN_H */
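A minimal usage sketch of the wrapper API defined above, mirroring clang/test/CodeGen/X86/amx_transpose_api.c. The buffer, stride, and prior tile-shape configuration are assumptions of the example:

  #include <immintrin.h>

  static char buf[2048];

  /* Load a pair of VNNI-formatted tiles from buf, then transpose the first
     into out. Tile shapes (the row/col fields) must be set up beforehand. */
  void pair_load_then_transpose(__tile1024i *dst0, __tile1024i *dst1,
                                __tile1024i *out) {
    __tile_2rpntlvwz0(dst0, dst1, buf, 32);
    __tile_transposed(out, *dst0);
  }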
@ -652,6 +652,10 @@ _storebe_i64(void * __P, long long __D) {
#include <amxfp8intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__)
#include <amxtransposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
@ -631,6 +631,10 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
  case X86::BI__builtin_ia32_tileloaddt164:
  case X86::BI__builtin_ia32_tilestored64:
  case X86::BI__builtin_ia32_tilezero:
  case X86::BI__builtin_ia32_t2rpntlvwz0:
  case X86::BI__builtin_ia32_t2rpntlvwz0t1:
  case X86::BI__builtin_ia32_t2rpntlvwz1:
  case X86::BI__builtin_ia32_t2rpntlvwz1t1:
    return CheckBuiltinTileArgumentsRange(TheCall, 0);
  case X86::BI__builtin_ia32_tdpbssd:
  case X86::BI__builtin_ia32_tdpbsud:
@ -645,6 +649,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
  case X86::BI__builtin_ia32_tdphbf8ps:
  case X86::BI__builtin_ia32_tdphf8ps:
    return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
  case X86::BI__builtin_ia32_ttransposed:
    return CheckBuiltinTileArgumentsRange(TheCall, {0, 1});
  }
}

static bool isX86_32Builtin(unsigned BuiltinID) {
clang/test/CodeGen/X86/amx_transpose.c (new file, 36 lines)
@ -0,0 +1,36 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \
// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s

#include <immintrin.h>
#include <stddef.h>

void test_tile_2rpntlvwz0(const void *A, size_t B) {
  // CHECK-LABEL: @test_tile_2rpntlvwz0
  // CHECK: call void @llvm.x86.t2rpntlvwz0(i8 1, ptr %{{.*}}, i64 %{{.*}})
  _tile_2rpntlvwz0(1, A, B);
}

void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
  // CHECK-LABEL: @test_tile_2rpntlvwz0t1
  // CHECK: call void @llvm.x86.t2rpntlvwz0t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
  _tile_2rpntlvwz0t1(1, A, B);
}

void test_tile_2rpntlvwz1(const void *A, size_t B) {
  // CHECK-LABEL: @test_tile_2rpntlvwz1
  // CHECK: call void @llvm.x86.t2rpntlvwz1(i8 1, ptr %{{.*}}, i64 %{{.*}})
  _tile_2rpntlvwz1(1, A, B);
}

void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
  // CHECK-LABEL: @test_tile_2rpntlvwz1t1
  // CHECK: call void @llvm.x86.t2rpntlvwz1t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
  _tile_2rpntlvwz1t1(1, A, B);
}

void test_tile_transposed(void)
{
  // CHECK-LABEL: @test_tile_transposed
  // CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2)
  _tile_transposed(1, 2);
}
clang/test/CodeGen/X86/amx_transpose_api.c (new file, 66 lines)
@ -0,0 +1,66 @@
// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f \
// RUN: -target-feature +amx-transpose \
// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK

#include <immintrin.h>

char buf[2048];
#define STRIDE 32

char buf2[2048];

void test_tile_2rpntlvwz0(__tile1024i dst0, __tile1024i dst1) {
  //CHECK-LABEL: @test_tile_2rpntlvwz0
  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
  __tile_2rpntlvwz0(&dst0, &dst1, buf, STRIDE);
}

void test_tile_2rpntlvwz0t1(__tile1024i dst0, __tile1024i dst1) {
  //CHECK-LABEL: @test_tile_2rpntlvwz0t1
  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal
  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
  __tile_2rpntlvwz0t1(&dst0, &dst1, buf, STRIDE);
}

void test_tile_2rpntlvwz1(__tile1024i dst0, __tile1024i dst1) {
  //CHECK-LABEL: @test_tile_2rpntlvwz1
  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal
  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
  __tile_2rpntlvwz1(&dst0, &dst1, buf, STRIDE);
}

void test_tile_2rpntlvwz1t1(__tile1024i dst0, __tile1024i dst1) {
  //CHECK-LABEL: @test_tile_2rpntlvwz1t1
  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal
  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
  __tile_2rpntlvwz1t1(&dst0, &dst1, buf, STRIDE);
}

void test_tile_transposed(__tile1024i dst, __tile1024i src) {
  //CHECK-LABEL: @test_tile_transposed
  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
  //CHECK-DAG: call x86_amx @llvm.x86.ttransposed.internal
  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
  __tile_transposed(&dst, src);
}
clang/test/CodeGen/X86/amx_transpose_errors.c (new file, 31 lines)
@ -0,0 +1,31 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \
// RUN: -target-feature +avx512f -target-feature +amx-element-evex -verify

#include <immintrin.h>
#include <stddef.h>
#include <immintrin.h>
#include <stddef.h>

// Transpose
void test_tile_2rpntlvwz0(const void *A, size_t B) {
  _tile_2rpntlvwz0(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}

void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
  _tile_2rpntlvwz0t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}

void test_tile_2rpntlvwz1(const void *A, size_t B) {
  _tile_2rpntlvwz1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}

void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
  _tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}

void test_tile_transposed()
{
  _tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
  _tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}
@ -304,6 +304,13 @@
// AMX-COMPLEX: "-target-feature" "+amx-complex"
// NO-AMX-COMPLEX: "-target-feature" "-amx-complex"

// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-transpose %s \
// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TRANSPOSE %s
// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-transpose %s \
// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s
// AMX-TRANSPOSE: "-target-feature" "+amx-transpose"
// NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose"

// RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s
// RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s
// HRESET: "-target-feature" "+hreset"
@ -546,6 +546,18 @@

// NO-AMX-COMPLEX-NOT: #define __AMX_COMPLEX__ 1

// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -x c \
// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-TRANSPOSE %s

// AMX-TRANSPOSE: #define __AMX_TRANSPOSE__ 1

// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-transpose -x c \
// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s
// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -mno-amx-tile \
// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s

// NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1

// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s

// AVXVNNI: #define __AVX2__ 1
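In user code, the macro tested above supports the usual feature-detection pattern; a sketch (nothing here is specific to this commit beyond the macro name):

  #include <immintrin.h>

  void maybe_transpose(void) {
  #if defined(__AMX_TRANSPOSE__)
    /* AMX-TRANSPOSE intrinsics such as _tile_transposed are available. */
  #endif
  }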
@ -34,9 +34,31 @@ public:
    if (MRI)
      deduceImm(MRI);
  }
  // When ShapeT has multiple shapes, we only use Shapes (never use Row and
  // Col) and ImmShapes. Because the common case is a single shape (which
  // simply uses Shape.Row or Shape.Col), we don't merge Row and Col into the
  // Shapes vector, to keep the speed and code simplicity.
  // TODO: The above solution is a temporary way to minimize current tile
  // register allocation code changes. It cannot handle both Reg shape and
  // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2
  // is imm shape). Refine me when we have more multi-tile shape instructions!
  ShapeT(ArrayRef<MachineOperand *> ShapesOperands,
         const MachineRegisterInfo *MRI = nullptr)
      : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
        ColImm(InvalidImmShape) {
    assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!");

    for (auto *Shape : ShapesOperands)
      Shapes.push_back(Shape);

    if (MRI)
      deduceImm(MRI);
  }
  ShapeT()
      : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
        ColImm(InvalidImmShape) {}
  // TODO: We may need to extend the comparison operator to multi-shapes if
  // the requirement arises in the future.
  bool operator==(const ShapeT &Shape) const {
    MachineOperand *R = Shape.Row;
    MachineOperand *C = Shape.Col;
@ -53,13 +75,40 @@ public:

  bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); }

  MachineOperand *getRow() const { return Row; }
  MachineOperand *getRow(unsigned I = 0) const {
    if (Shapes.empty())
      return Row;
    assert(Shapes.size() / 2 >= I && "Get invalid row from id!");
    return Shapes[I * 2];
  }

  MachineOperand *getCol() const { return Col; }
  MachineOperand *getCol(unsigned I = 0) const {
    if (Shapes.empty())
      return Col;
    assert(Shapes.size() / 2 >= I && "Get invalid col from id!");
    return Shapes[I * 2 + 1];
  }

  int64_t getRowImm() const { return RowImm; }
  int64_t getRowImm(unsigned I = 0) const {
    if (ImmShapes.empty())
      return RowImm;
    assert(ImmShapes.size() / 2 >= I && "Get invalid imm row from id!");
    return ImmShapes[I * 2];
  }

  int64_t getColImm() const { return ColImm; }
  int64_t getColImm(unsigned I = 0) const {
    if (ImmShapes.empty())
      return ColImm;
    assert(ImmShapes.size() / 2 >= I && "Get invalid imm col from id!");
    return ImmShapes[I * 2 + 1];
  }

  unsigned getShapeNum() {
    if (Shapes.empty())
      return isValid() ? 1 : 0;
    else
      return Shapes.size() / 2;
  }

  bool isValid() { return (Row != nullptr) && (Col != nullptr); }
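The accessors above imply a flattened storage convention; a short hedged sketch makes it explicit (illustrative helper only, not part of the header):

  // Shapes is laid out as {Row0, Col0, Row1, Col1, ...}, so for shape I,
  // getRow(I) reads Shapes[I * 2] and getCol(I) reads Shapes[I * 2 + 1].
  // A T2RPNTLVWZ0 destination pair contributes two (row, col) entries and
  // getShapeNum() reports Shapes.size() / 2 == 2.
  static inline std::pair<MachineOperand *, MachineOperand *>
  getShapePair(const ShapeT &S, unsigned I) {
    return {S.getRow(I), S.getCol(I)};
  }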
@ -72,14 +121,35 @@ public:
      for (const MachineOperand &DefMO : MRI->def_operands(Reg)) {
        const auto *MI = DefMO.getParent();
        if (MI->isMoveImmediate()) {
          Imm = MI->getOperand(1).getImm();
          assert(MI->getNumOperands() == 2 &&
                 "Unsupported number of operands in instruction for setting "
                 "row/column.");
          if (MI->getOperand(1).isImm()) {
            Imm = MI->getOperand(1).getImm();
          } else {
            assert(MI->getOperand(1).isImplicit() &&
                   "Operand 1 is assumed to be implicit.");
            Imm = 0;
          }
          break;
        }
      }
      return Imm;
    };
    RowImm = GetImm(Row->getReg());
    ColImm = GetImm(Col->getReg());
    if (Shapes.empty()) { // Single Shape
      RowImm = GetImm(Row->getReg());
      ColImm = GetImm(Col->getReg());
      // The number of rows of the 2nd destination buffer is assigned from
      // that of the 1st destination buffer. If the column size is equal to
      // zero, the row size should be reset to zero too.
      if (ColImm == 0)
        Row = Col;
    } else { // Multiple Shapes
      for (auto *Shape : Shapes) {
        int64_t ImmShape = GetImm(Shape->getReg());
        ImmShapes.push_back(ImmShape);
      }
    }
  }

private:
@ -88,6 +158,9 @@ private:
  MachineOperand *Col;
  int64_t RowImm = -1;
  int64_t ColImm = -1;
  // Multiple Shapes
  SmallVector<MachineOperand *, 0> Shapes;
  SmallVector<int64_t, 0> ImmShapes;
};

} // namespace llvm
@ -5917,6 +5917,41 @@ let TargetPrefix = "x86" in {
                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
                         ImmArg<ArgIndex<2>>]>;

  // AMX-FP8
  def int_x86_tdpbf8ps : ClangBuiltin<"__builtin_ia32_tdpbf8ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
  def int_x86_tdpbhf8ps : ClangBuiltin<"__builtin_ia32_tdpbhf8ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
  def int_x86_tdphbf8ps : ClangBuiltin<"__builtin_ia32_tdphbf8ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
  def int_x86_tdphf8ps : ClangBuiltin<"__builtin_ia32_tdphf8ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

  // AMX-TRANSPOSE
  def int_x86_t2rpntlvwz0 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0">,
              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
                        [ImmArg<ArgIndex<0>>]>;
  def int_x86_t2rpntlvwz0t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0t1">,
              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
                        [ImmArg<ArgIndex<0>>]>;
  def int_x86_t2rpntlvwz1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1">,
              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
                        [ImmArg<ArgIndex<0>>]>;
  def int_x86_t2rpntlvwz1t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1t1">,
              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
                        [ImmArg<ArgIndex<0>>]>;
  def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

  // AMX - internal intrinsics
  def int_x86_ldtilecfg_internal :
              ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">,
@ -5995,22 +6030,26 @@ let TargetPrefix = "x86" in {
                         llvm_x86amx_ty, llvm_x86amx_ty,
                         llvm_x86amx_ty], []>;

  def int_x86_tdpbf8ps : ClangBuiltin<"__builtin_ia32_tdpbf8ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
  def int_x86_tdpbhf8ps : ClangBuiltin<"__builtin_ia32_tdpbhf8ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
  def int_x86_tdphbf8ps : ClangBuiltin<"__builtin_ia32_tdphbf8ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
  def int_x86_tdphf8ps : ClangBuiltin<"__builtin_ia32_tdphf8ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>,
                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
  def int_x86_t2rpntlvwz0_internal :
              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                        []>;
  def int_x86_t2rpntlvwz0t1_internal :
              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                        []>;
  def int_x86_t2rpntlvwz1_internal :
              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                        []>;
  def int_x86_t2rpntlvwz1t1_internal :
              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                        []>;
  def int_x86_ttransposed_internal :
              ClangBuiltin<"__builtin_ia32_ttransposed_internal">,
              Intrinsic<[llvm_x86amx_ty],
                        [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>;
}

//===----------------------------------------------------------------------===//
@ -511,6 +511,7 @@ enum OperandEncoding { ENCODINGS ENCODING_max };
  ENUM_ENTRY(TYPE_VK, "mask register")                                         \
  ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair")                               \
  ENUM_ENTRY(TYPE_TMM, "tile")                                                 \
  ENUM_ENTRY(TYPE_TMM_PAIR, "tile pair")                                       \
  ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
  ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand")                          \
  ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand")                      \
@ -265,6 +265,7 @@ X86_FEATURE_COMPAT(AVX10_2_512, "avx10.2-512", 0)
X86_FEATURE       (MOVRS, "movrs")
X86_FEATURE       (ZU, "zu")
X86_FEATURE       (AMX_FP8, "amx-fp8")
X86_FEATURE       (AMX_TRANSPOSE, "amx-transpose")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE       (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
@ -623,6 +623,37 @@ struct X86Operand final : public MCParsedAsmOperand {
    Inst.addOperand(MCOperand::createReg(Reg));
  }

  bool isTILEPair() const {
    return Kind == Register &&
           X86MCRegisterClasses[X86::TILERegClassID].contains(getReg());
  }

  void addTILEPairOperands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    unsigned Reg = getReg();
    switch (Reg) {
    default:
      llvm_unreachable("Invalid tile register!");
    case X86::TMM0:
    case X86::TMM1:
      Reg = X86::TMM0_TMM1;
      break;
    case X86::TMM2:
    case X86::TMM3:
      Reg = X86::TMM2_TMM3;
      break;
    case X86::TMM4:
    case X86::TMM5:
      Reg = X86::TMM4_TMM5;
      break;
    case X86::TMM6:
    case X86::TMM7:
      Reg = X86::TMM6_TMM7;
      break;
    }
    Inst.addOperand(MCOperand::createReg(Reg));
  }

  void addMemOperands(MCInst &Inst, unsigned N) const {
    assert((N == 5) && "Invalid number of operands!");
    if (getMemBaseReg())
@ -806,6 +806,10 @@ static int readModRM(struct InternalInstruction *insn) {
    if (index > 7)                                                             \
      *valid = 0;                                                              \
    return prefix##_TMM0 + index;                                              \
  case TYPE_TMM_PAIR:                                                          \
    if (index > 7)                                                             \
      *valid = 0;                                                              \
    return prefix##_TMM0_TMM1 + (index / 2);                                   \
  case TYPE_VK:                                                                \
    index &= 0xf;                                                              \
    if (index > 7)                                                             \
@ -2315,6 +2319,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
  case TYPE_YMM:
  case TYPE_ZMM:
  case TYPE_TMM:
  case TYPE_TMM_PAIR:
  case TYPE_VK_PAIR:
  case TYPE_VK:
  case TYPE_DEBUGREG:
@ -535,6 +535,12 @@ namespace X86Disassembler {
  ENTRY(TMM6)                                                                  \
  ENTRY(TMM7)

#define REGS_TMM_PAIRS                                                         \
  ENTRY(TMM0_TMM1)                                                             \
  ENTRY(TMM2_TMM3)                                                             \
  ENTRY(TMM4_TMM5)                                                             \
  ENTRY(TMM6_TMM7)

#define ALL_EA_BASES                                                           \
  EA_BASES_16BIT                                                               \
  EA_BASES_32BIT                                                               \
@ -559,6 +565,7 @@ namespace X86Disassembler {
  REGS_DEBUG                                                                   \
  REGS_CONTROL                                                                 \
  REGS_TMM                                                                     \
  REGS_TMM_PAIRS                                                               \
  ENTRY(RIP)

/// All possible values of the base field for effective-address
@ -463,3 +463,22 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
  }
  llvm_unreachable("Unknown mask pair register name");
}

void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo,
                                         raw_ostream &OS) {
  switch (MI->getOperand(OpNo).getReg()) {
  case X86::TMM0_TMM1:
    printRegName(OS, X86::TMM0);
    return;
  case X86::TMM2_TMM3:
    printRegName(OS, X86::TMM2);
    return;
  case X86::TMM4_TMM5:
    printRegName(OS, X86::TMM4);
    return;
  case X86::TMM6_TMM7:
    printRegName(OS, X86::TMM6);
    return;
  }
  llvm_unreachable("Unknown tile pair register name");
}
@ -38,6 +38,7 @@ protected:
                    const MCSubtargetInfo &STI);
  void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
  void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
  void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
};

} // end namespace llvm
@ -273,6 +273,9 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
                                     "Support AMX-FP8 instructions",
                                     [FeatureAMXTILE]>;
def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true",
                                           "Support AMX-TRANSPOSE instructions",
                                           [FeatureAMXTILE]>;
def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true",
                                        "Support CMPCCXADD instructions">;
def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true",
@ -568,6 +568,131 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
    MI.setDesc(TII->get(Opc));
    return true;
  }
  // TILEPAIRLOAD is just for TILEPair spill; we don't have a corresponding
  // AMX instruction to support it. So, split it into 2 load instructions:
  // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" -->
  // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" +
  // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment"
  case X86::PTILEPAIRLOAD: {
    int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm();
    Register TReg = MBBI->getOperand(0).getReg();
    bool DstIsDead = MBBI->getOperand(0).isDead();
    Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
    Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
    unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;

    MachineInstrBuilder MIBLo =
        BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
            .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead));
    MachineInstrBuilder MIBHi =
        BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
            .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead));

    for (int i = 0; i < X86::AddrNumOperands; ++i) {
      MIBLo.add(MBBI->getOperand(1 + i));
      if (i == X86::AddrDisp)
        MIBHi.addImm(Disp + TmmSize);
      else
        MIBHi.add(MBBI->getOperand(1 + i));
    }

    // Make sure the stride register used in the first tileload stays alive.
    MachineOperand &Stride =
        MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg);
    Stride.setIsKill(false);

    // Split the memory operand, adjusting the offset and size for the halves.
    MachineMemOperand *OldMMO = MBBI->memoperands().front();
    MachineFunction *MF = MBB.getParent();
    MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
    MachineMemOperand *MMOHi =
        MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);

    MIBLo.setMemRefs(MMOLo);
    MIBHi.setMemRefs(MMOHi);

    // Delete the pseudo.
    MBB.erase(MBBI);
    return true;
  }
  // Similar to TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill; no
  // corresponding AMX instruction supports it. So, split it too:
  // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" -->
  // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" +
  // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1"
  case X86::PTILEPAIRSTORE: {
    int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm();
    Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg();
    bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill();
    Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
    Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
    unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;

    MachineInstrBuilder MIBLo =
        BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED));
    MachineInstrBuilder MIBHi =
        BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED));

    for (int i = 0; i < X86::AddrNumOperands; ++i) {
      MIBLo.add(MBBI->getOperand(i));
      if (i == X86::AddrDisp)
        MIBHi.addImm(Disp + TmmSize);
      else
        MIBHi.add(MBBI->getOperand(i));
    }
    MIBLo.addReg(TReg0, getKillRegState(SrcIsKill));
    MIBHi.addReg(TReg1, getKillRegState(SrcIsKill));

    // Make sure the stride register used in the first tilestore stays alive.
    MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg);
    Stride.setIsKill(false);

    // Split the memory operand, adjusting the offset and size for the halves.
    MachineMemOperand *OldMMO = MBBI->memoperands().front();
    MachineFunction *MF = MBB.getParent();
    MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
    MachineMemOperand *MMOHi =
        MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);

    MIBLo.setMemRefs(MMOLo);
    MIBHi.setMemRefs(MMOHi);

    // Delete the pseudo.
    MBB.erase(MBBI);
    return true;
  }
  case X86::PT2RPNTLVWZ0V:
  case X86::PT2RPNTLVWZ0T1V:
  case X86::PT2RPNTLVWZ1V:
  case X86::PT2RPNTLVWZ1T1V: {
    for (unsigned i = 3; i > 0; --i)
      MI.removeOperand(i);
    unsigned Opc;
    switch (Opcode) {
    case X86::PT2RPNTLVWZ0V:
      Opc = X86::T2RPNTLVWZ0;
      break;
    case X86::PT2RPNTLVWZ0T1V:
      Opc = X86::T2RPNTLVWZ0T1;
      break;
    case X86::PT2RPNTLVWZ1V:
      Opc = X86::T2RPNTLVWZ1;
      break;
    case X86::PT2RPNTLVWZ1T1V:
      Opc = X86::T2RPNTLVWZ1T1;
      break;
    default:
      llvm_unreachable("Impossible Opcode!");
    }
    MI.setDesc(TII->get(Opc));
    return true;
  }
  case X86::PTTRANSPOSEDV: {
    for (int i = 2; i > 0; --i)
      MI.removeOperand(i);
    MI.setDesc(TII->get(X86::TTRANSPOSED));
    return true;
  }
  case X86::PTCMMIMFP16PSV:
  case X86::PTCMMRLFP16PSV:
  case X86::PTDPBSSDV:
@ -268,24 +268,36 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI,
                    << printReg(TileReg, TRI) << '\n');
}

static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
  if (Reg.isVirtual()) {
    unsigned RegClassID = MRI->getRegClass(Reg)->getID();
    if (RegClassID == X86::TILERegClassID)
      return 1;
    if (RegClassID == X86::TILEPAIRRegClassID)
      return 2;
  } else {
    if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
      return 1;
    if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
      return 2;
  }
  return 0;
}

static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) {
  return getTileDefNum(MRI, VirtReg) > 0;
}

static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
  // The instruction must have 3 operands: tile def, row, col.
  if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo())
    return false;
  MachineOperand &MO = MI.getOperand(0);

  if (MO.isReg()) {
    Register Reg = MO.getReg();
    // FIXME it may be used after Greedy RA and the physical
    // register is not rewritten yet.
    if (Reg.isVirtual() &&
        MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
      return true;
    if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
      return true;
  }
  if (!MO.isReg())
    return false;

  return false;
  return getTileDefNum(MRI, MO.getReg()) > 0;
}

static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) {
@ -424,8 +436,7 @@ void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB,

static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
  MachineOperand &MO = MI.getOperand(0);
  if (MO.isReg() && MO.getReg().isVirtual() &&
      MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
  if (MO.isReg() && MO.getReg().isVirtual() && isTileRegister(MRI, MO.getReg()))
    return true;
  return false;
}
@ -524,8 +535,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (Reg.isVirtual() &&
          MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
      if (Reg.isVirtual() && isTileRegister(MRI, Reg))
        return true;
    }
    return false;
@ -617,6 +627,19 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
      else if (dominates(MBB, LastShapeMI, ColMI))
        LastShapeMI = ColMI;
    }
    unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg());
    if (TileDefNum > 1) {
      for (unsigned I = 1; I < TileDefNum; I++) {
        MachineOperand *ColxMO = &MI.getOperand(2 + I);
        MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg());
        if (ColxMI->getParent() == &MBB) {
          if (!LastShapeMI)
            LastShapeMI = ColxMI;
          else if (dominates(MBB, LastShapeMI, ColxMI))
            LastShapeMI = ColxMI;
        }
      }
    }
    // If there is user live out of the tilecfg, spill it and reload in
    // before the user.
    Register TileReg = MI.getOperand(0).getReg();
@ -80,28 +80,41 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
                    "Fast Tile Register Configure", false, false)

static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) {
  // There is no phi instruction after register allocation.
  assert(MI.isPHI() == false);
  // The instruction must have 3 operands: tile def, row, col.
  // It should be an AMX pseudo instruction that has a shape operand.
  if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 ||
      !MI.isPseudo())
    return false;
    return 0;
  MachineOperand &MO = MI.getOperand(0);

  if (MO.isReg()) {
    Register Reg = MO.getReg();
    // FIXME it may be used after Greedy RA and the physical
    // FIXME: It may be used after Greedy RA and the physical
    // register is not rewritten yet.
    if (Reg.isVirtual() &&
        MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
      return true;
    if (Reg.isVirtual()) {
      if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
        return 1;
      if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID)
        return 2;
    }
    if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
      return true;
      return 1;
    if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
      return 2;
  }

  return false;
  return 0;
}

static unsigned getTMMIndex(Register Reg) {
  if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
    return Reg - X86::TMM0;
  if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
    return (Reg - X86::TMM0_TMM1) * 2;
  llvm_unreachable("Invalid Tmm Reg!");
}

// PreTileConfig should configure the tile registers based on basic
@ -110,14 +123,17 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
  bool Change = false;
  SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos;
  for (MachineInstr &MI : reverse(MBB)) {
    if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV)
    unsigned DefNum = getNumDefTiles(MRI, MI);
    if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV)
      continue;
    // AMX instructions that define tile register.
    if (MI.getOpcode() != X86::PLDTILECFGV) {
      MachineOperand &Row = MI.getOperand(1);
      MachineOperand &Col = MI.getOperand(2);
      unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0;
      ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)});
      unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg());
      for (unsigned I = 0; I < DefNum; I++) {
        MachineOperand &Col = MI.getOperand(2 + I);
        ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)});
      }
    } else { // PLDTILECFGV
      // Rewrite the shape information to memory. Stack slot should have
      // been initialized to zero in pre config.
@ -323,6 +323,35 @@ namespace {
      Segment = CurDAG->getRegister(0, MVT::i16);
    }

    // Utility function to determine whether N is an AMX SDNode right after
    // lowering but before ISEL.
    bool isAMXSDNode(SDNode *N) const {
      // Check if N is an AMX SDNode:
      // 1. check specific opcode since these carry MVT::Untyped instead of
      //    x86amx_type;
      // 2. check result type;
      // 3. check operand type;
      switch (N->getOpcode()) {
      default:
        break;
      case X86::PT2RPNTLVWZ0V:
      case X86::PT2RPNTLVWZ0T1V:
      case X86::PT2RPNTLVWZ1V:
      case X86::PT2RPNTLVWZ1T1V:
        return true;
      }
      for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
        if (N->getValueType(Idx) == MVT::x86amx)
          return true;
      }
      for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
        SDValue Op = N->getOperand(Idx);
        if (Op.getValueType() == MVT::x86amx)
          return true;
      }
      return false;
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
@ -5278,6 +5307,47 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
      ReplaceNode(Node, CNode);
      return;
    }
    case Intrinsic::x86_t2rpntlvwz0:
    case Intrinsic::x86_t2rpntlvwz0t1:
    case Intrinsic::x86_t2rpntlvwz1:
    case Intrinsic::x86_t2rpntlvwz1t1: {
      if (!Subtarget->hasAMXTRANSPOSE())
        break;
      auto *MFI =
          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
      MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
      unsigned Opc;
      switch (IntNo) {
      default:
        llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_t2rpntlvwz0:
        Opc = X86::PT2RPNTLVWZ0;
        break;
      case Intrinsic::x86_t2rpntlvwz0t1:
        Opc = X86::PT2RPNTLVWZ0T1;
        break;
      case Intrinsic::x86_t2rpntlvwz1:
        Opc = X86::PT2RPNTLVWZ1;
        break;
      case Intrinsic::x86_t2rpntlvwz1t1:
        Opc = X86::PT2RPNTLVWZ1T1;
        break;
      }
      // FIXME: Match displacement and scale.
      unsigned TIndex = Node->getConstantOperandVal(2);
      SDValue TReg = getI8Imm(TIndex, dl);
      SDValue Base = Node->getOperand(3);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(4);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    }
    break;
  }
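
For readers unfamiliar with the five-part memory operand assembled above (Base, Scale, Index, Disp, Segment): the selected load reads from base + index*scale + disp, and this path currently pins Scale to 1 and Disp to 0 (see the FIXME). A minimal sketch of that effective-address arithmetic, under those same assumptions:

#include <cstdint>
#include <cstdio>

// Effective address for the SIB-style operand built above, with the
// constants this selection currently uses (Scale = 1, Disp = 0).
uint64_t effectiveAddress(uint64_t Base, uint64_t Index, uint8_t Scale,
                          int32_t Disp) {
  return Base + Index * Scale + Disp;
}

int main() {
  // e.g. the test further below uses (%rsi,%rdi) with base = buf and a
  // stride of 32 in the index register; 0x1000 is a made-up base address.
  std::printf("%llu\n",
              (unsigned long long)effectiveAddress(0x1000, 32, 1, 0)); // 4128
  return 0;
}
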
@ -27291,6 +27291,53 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                       Operation.getValue(1));
  }
  case Intrinsic::x86_t2rpntlvwz0_internal:
  case Intrinsic::x86_t2rpntlvwz0t1_internal:
  case Intrinsic::x86_t2rpntlvwz1_internal:
  case Intrinsic::x86_t2rpntlvwz1t1_internal: {
    if (!Subtarget.hasAMXTILE())
      break;
    auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
    X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
    unsigned IntNo = Op.getConstantOperandVal(1);
    unsigned Opc = 0;
    switch (IntNo) {
    default:
      llvm_unreachable("Unexpected intrinsic!");
    case Intrinsic::x86_t2rpntlvwz0_internal:
      Opc = X86::PT2RPNTLVWZ0V;
      break;
    case Intrinsic::x86_t2rpntlvwz0t1_internal:
      Opc = X86::PT2RPNTLVWZ0T1V;
      break;
    case Intrinsic::x86_t2rpntlvwz1_internal:
      Opc = X86::PT2RPNTLVWZ1V;
      break;
    case Intrinsic::x86_t2rpntlvwz1t1_internal:
      Opc = X86::PT2RPNTLVWZ1T1V;
      break;
    }

    SDLoc DL(Op);
    SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);

    SDValue Ops[] = {Op.getOperand(2),                       // Row
                     Op.getOperand(3),                       // Col0
                     Op.getOperand(4),                       // Col1
                     Op.getOperand(5),                       // Base
                     DAG.getTargetConstant(1, DL, MVT::i8),  // Scale
                     Op.getOperand(6),                       // Index
                     DAG.getTargetConstant(0, DL, MVT::i32), // Disp
                     DAG.getRegister(0, MVT::i16),           // Segment
                     Op.getOperand(0)};                      // Chain

    MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
    SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
                                              SDValue(Res, 0));
    SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
                                              SDValue(Res, 0));
    return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
  }
  case Intrinsic::x86_atomic_bts_rm:
  case Intrinsic::x86_atomic_btc_rm:
  case Intrinsic::x86_atomic_btr_rm: {
@ -37039,6 +37086,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
    assert (Imm < 8 && "Illegal tmm index");
    return X86::TMM0 + Imm;
  };
  auto TMMImmToTMMPair = [](unsigned Imm) {
    assert(Imm < 8 && "Illegal tmm pair index.");
    return X86::TMM0_TMM1 + Imm / 2;
  };
  switch (MI.getOpcode()) {
  default: llvm_unreachable("Unexpected instr type to insert");
  case X86::TLS_addr32:
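
TMMImmToTMMPair compresses a tile index into a pair index: tiles 2k and 2k+1 both map to the k-th pair register. A standalone sketch of that mapping (enum values are stand-ins, not the real X86 register numbers):

#include <cassert>
#include <cstdio>

enum Reg { TMM0_TMM1 = 100, TMM2_TMM3, TMM4_TMM5, TMM6_TMM7 }; // stand-ins

// Mirrors the TMMImmToTMMPair lambda above.
unsigned TMMImmToTMMPair(unsigned Imm) {
  assert(Imm < 8 && "Illegal tmm pair index.");
  return TMM0_TMM1 + Imm / 2;
}

int main() {
  // Tile indices 4 and 5 both select the TMM4_TMM5 pair.
  std::printf("%d %d\n", TMMImmToTMMPair(4) == TMM4_TMM5,
              TMMImmToTMMPair(5) == TMM4_TMM5); // 1 1
  return 0;
}
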
@ -37521,6 +37572,49 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
  case X86::PT2RPNTLVWZ0:
  case X86::PT2RPNTLVWZ0T1:
  case X86::PT2RPNTLVWZ1:
  case X86::PT2RPNTLVWZ1T1: {
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned Opc;
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("Unexpected instruction!");
    case X86::PT2RPNTLVWZ0:
      Opc = X86::T2RPNTLVWZ0;
      break;
    case X86::PT2RPNTLVWZ0T1:
      Opc = X86::T2RPNTLVWZ0T1;
      break;
    case X86::PT2RPNTLVWZ1:
      Opc = X86::T2RPNTLVWZ1;
      break;
    case X86::PT2RPNTLVWZ1T1:
      Opc = X86::T2RPNTLVWZ1T1;
      break;
    }
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
    MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);

    MIB.add(MI.getOperand(1)); // base
    MIB.add(MI.getOperand(2)); // scale
    MIB.add(MI.getOperand(3)); // index
    MIB.add(MI.getOperand(4)); // displacement
    MIB.add(MI.getOperand(5)); // segment
    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
  case X86::PTTRANSPOSED: {
    const DebugLoc &DL = MI.getDebugLoc();

    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::TTRANSPOSED));
    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);

    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
  }
}

@ -306,3 +306,66 @@ let Predicates = [HasAMXFP8, In64BitMode] in {
    }
  }
}

let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in {
  let mayStore = 1 in
  def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>;
  let mayLoad = 1 in
  def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>;
}

let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
  let SchedRW = [WriteSystem] in {
    def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
                        (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}",
                        []>, VEX, WIG, T8,PS;

    def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst),
                          (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}",
                          []>, VEX, T8,PS;

    def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
                        (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}",
                        []>, VEX, T8,PD;

    def T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst),
                          (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}",
                          []>, VEX, T8,PD;

    def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src),
                        "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS;
    let isPseudo = true in {
      def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst),
                                  (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
                                  []>;
      def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst),
                                    (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
                                    []>;
      def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst),
                                  (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
                                  []>;
      def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst),
                                    (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
                                    []>;
    }

    def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst),
                                (ins GR16:$src1, GR16:$src2, TILE:$src),
                                [(set TILE:$dst,
                                  (int_x86_ttransposed_internal GR16:$src1, GR16:$src2,
                                                                TILE:$src))]>;

    let usesCustomInserter = 1 in {
      def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst,
                                          sibmem:$src1), []>;
      def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst,
                                            sibmem:$src1), []>;
      def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst,
                                          sibmem:$src1), []>;
      def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst,
                                            sibmem:$src1), []>;
      def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src),
                                 [(int_x86_ttransposed timm:$dst, timm:$src)]>;
    }
  }
} // HasAMXTILE, HasAMXTRANSPOSE

@ -4538,6 +4538,11 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
    return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
                : GET_EGPR_IF_ENABLED(X86::TILESTORED);
#undef GET_EGPR_IF_ENABLED
  case 2048:
    assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) &&
           "Unknown 2048-byte regclass");
    assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE");
    return Load ? X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE;
  }
}

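
The new 2048-byte case exists because a tile-pair spill slot is simply two 1024-byte tiles back to back: the t0/t1 halves live at byte offsets 0 and 1024 (as the 1024(%rsp,...) folded spills in the test below also show). A small sketch of that layout arithmetic; the helper name is mine:

#include <cstdio>

// One AMX tile is 8192 bits = 1024 bytes; a pair spill slot holds two.
constexpr unsigned TileBytes = 1024;
constexpr unsigned PairBytes = 2 * TileBytes; // the 2048-byte regclass case

// Byte offset of subtile I (0 or 1) inside a pair spill slot.
constexpr unsigned subTileOffset(unsigned I) { return I * TileBytes; }

int main() {
  std::printf("%u %u %u\n", PairBytes, subTileOffset(0), subTileOffset(1));
  // prints: 2048 0 1024
  return 0;
}
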
@ -4732,6 +4737,7 @@ static bool isAMXOpcode(unsigned Opc) {
  case X86::TILESTORED:
  case X86::TILELOADD_EVEX:
  case X86::TILESTORED_EVEX:
  case X86::PTILEPAIRLOAD:
    return true;
  }
}
@ -4744,7 +4750,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
  default:
    llvm_unreachable("Unexpected special opcode!");
  case X86::TILESTORED:
  case X86::TILESTORED_EVEX: {
  case X86::TILESTORED_EVEX:
  case X86::PTILEPAIRSTORE: {
    // tilestored %tmm, (%sp, %idx)
    MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
    Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
@ -4758,7 +4765,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
    break;
  }
  case X86::TILELOADD:
  case X86::TILELOADD_EVEX: {
  case X86::TILELOADD_EVEX:
  case X86::PTILEPAIRLOAD: {
    // tileloadd (%sp, %idx), %tmm
    MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
    Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
@ -501,3 +501,10 @@ def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
  let ParserMatchClass = VK16PairAsmOperand;
}

let RenderMethod = "addTILEPairOperands" in
def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; }

def TILEPair : RegisterOperand<TILEPAIR, "printTILEPair"> {
  let ParserMatchClass = TILEPairAsmOperand;
}
@ -184,6 +184,7 @@ def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">;
def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">;
def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">;
def HasCRC32 : Predicate<"Subtarget->hasCRC32()">;

@ -74,6 +74,22 @@ static bool isAMXCast(Instruction *II) {
         match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
}

// Some instructions may return more than one tile.
// e.g.: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
static unsigned getNumDefTiles(IntrinsicInst *II) {
  Type *Ty = II->getType();
  if (Ty->isX86_AMXTy())
    return 1;

  unsigned Num = 0;
  for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) {
    Type *STy = Ty->getContainedType(i);
    if (STy->isX86_AMXTy())
      Num++;
  }
  return Num;
}

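
Since getNumDefTiles drives both the single- and multi-tile handling later in this pass, a tiny standalone model of it may help; it just counts the AMX members of an aggregate return type (the enum is a stand-in for real IR types):

#include <cstdio>
#include <vector>

enum class Ty { AMXTile, Other }; // stand-in for llvm::Type kinds

// Mirrors getNumDefTiles(): one x86_amx result counts as 1; for a struct
// return, count the x86_amx members.
unsigned getNumDefTiles(const std::vector<Ty> &RetTypes) {
  unsigned Num = 0;
  for (Ty T : RetTypes)
    if (T == Ty::AMXTile)
      ++Num;
  return Num;
}

int main() {
  // { x86_amx, x86_amx }, as returned by llvm.x86.t2rpntlvwz0.internal.
  std::printf("%u\n", getNumDefTiles({Ty::AMXTile, Ty::AMXTile})); // 2
  return 0;
}
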
static bool isAMXIntrinsic(Value *I) {
  auto *II = dyn_cast<IntrinsicInst>(I);
  if (!II)
@ -82,7 +98,7 @@ static bool isAMXIntrinsic(Value *I) {
    return false;
  // Check if the return type or a parameter is x86_amx. If it is x86_amx,
  // the intrinsic must be an x86 AMX intrinsic.
  if (II->getType()->isX86_AMXTy())
  if (getNumDefTiles(II) > 0)
    return true;
  for (Value *V : II->args()) {
    if (V->getType()->isX86_AMXTy())
@ -121,12 +137,96 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
  llvm_unreachable("No terminator in the entry block!");
}

static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
class ShapeCalculator {
private:
  TargetMachine *TM = nullptr;

  // In AMX intrinsics we let Shape = {Row, Col}, but the
  // RealCol = Col / ElementSize. We may use the RealCol
  // as a new Row for other newly created AMX intrinsics.
  std::map<Value *, Value *> Col2Row, Row2Col;

public:
  ShapeCalculator(TargetMachine *TargetM) : TM(TargetM) {}
  std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
  std::pair<Value *, Value *> getShape(PHINode *Phi);
  Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
  Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity);
};

Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
                                      unsigned Granularity) {
  if (Col2Row.count(V))
    return Col2Row[V];
  IRBuilder<> Builder(II);
  Value *RealRow = nullptr;
  if (isa<ConstantInt>(V))
    RealRow =
        Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) / Granularity);
  else if (isa<Instruction>(V)) {
    // When it is not a const value and it is not a function argument, we
    // create Row after the definition of V instead of
    // before II. For example, II is %118; we try to get the shape for %117:
    //   %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
    //   i32> %115).
    //   %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16
    //   %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx
    //   %117).
    // If we create %row = udiv i16 %106, 4 before %118 (aka. II), then its
    // definition is after its user (the new tileload for %117).
    // So, the best choice is to create %row right after the definition of
    // %106.
    Builder.SetInsertPoint(cast<Instruction>(V));
    RealRow = Builder.CreateUDiv(V, Builder.getInt16(4));
    cast<Instruction>(RealRow)->moveAfter(cast<Instruction>(V));
  } else {
    // When it is not a const value and it is a function argument, we create
    // Row at the entry bb.
    IRBuilder<> NewBuilder(
        getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
    RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity));
  }
  Col2Row[V] = RealRow;
  return RealRow;
}

Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V,
                                      unsigned Granularity) {
  if (Row2Col.count(V))
    return Row2Col[V];
  IRBuilder<> Builder(II);
  Value *RealCol = nullptr;
  if (isa<ConstantInt>(V))
    RealCol =
        Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity);
  else if (isa<Instruction>(V)) {
    Builder.SetInsertPoint(cast<Instruction>(V));
    RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity));
    cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V));
  } else {
    // When it is not a const value and it is a function argument, we create
    // Col at the entry bb.
    IRBuilder<> NewBuilder(
        getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
    RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity));
  }
  Row2Col[V] = RealCol;
  return RealCol;
}

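
The Col2Row/Row2Col caches above implement simple byte/element arithmetic: a shape stores its column in bytes, so with 4-byte elements the derived row count is Col/4 and the derived byte-column count is Row*4. A standalone sketch of just that arithmetic (the constant-folding fast path above):

#include <cstdint>
#include <cstdio>

// Constant-path versions of getRowFromCol()/getColFromRow() above.
uint16_t rowFromCol(uint16_t ColInBytes, uint16_t Granularity) {
  return ColInBytes / Granularity;
}
uint16_t colFromRow(uint16_t Row, uint16_t Granularity) {
  return Row * Granularity;
}

int main() {
  // For ttransposed with 4-byte elements: a source shape of 16 rows x
  // 64 bytes yields 64/4 = 16 rows and 16*4 = 64 bytes on the result side,
  // i.e. the two dimensions swap roles.
  std::printf("%u %u\n", rowFromCol(64, 4), colFromRow(16, 4)); // 16 64
  return 0;
}
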
// TODO: Refine the row and col-in-bytes of tile to row and col of matrix.
std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
                                                      unsigned OpNo) {
  (void)TM;
  IRBuilder<> Builder(II);
  Value *Row = nullptr, *Col = nullptr;
  switch (II->getIntrinsicID()) {
  default:
    llvm_unreachable("Expect amx intrinsics");
  case Intrinsic::x86_t2rpntlvwz0_internal:
  case Intrinsic::x86_t2rpntlvwz0t1_internal:
  case Intrinsic::x86_t2rpntlvwz1_internal:
  case Intrinsic::x86_t2rpntlvwz1t1_internal:
  case Intrinsic::x86_tileloadd64_internal:
  case Intrinsic::x86_tileloaddt164_internal:
  case Intrinsic::x86_tilestored64_internal: {
@ -154,43 +254,24 @@ static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
      Col = II->getArgOperand(2);
      break;
    case 5:
      if (isa<ConstantInt>(II->getArgOperand(2)))
        Row = Builder.getInt16(
            (cast<ConstantInt>(II->getOperand(2))->getSExtValue()) / 4);
      else if (isa<Instruction>(II->getArgOperand(2))) {
        // When it is not a const value and it is not a function argument, we
        // create Row after the definition of II->getOperand(2) instead of
        // before II. For example, II is %118, we try to getshape for %117:
        //   %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
        //   i32> %115).
        //   %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16
        //   %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx
        //   %117).
        // If we create %row = udiv i16 %106, 4 before %118(aka. II), then its
        // definition is after its user(new tileload for %117).
        // So, the best choice is to create %row right after the definition of
        // %106.
        Builder.SetInsertPoint(cast<Instruction>(II->getOperand(2)));
        Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4));
        cast<Instruction>(Row)->moveAfter(cast<Instruction>(II->getOperand(2)));
      } else {
        // When it is not a const value and it is a function argument, we create
        // Row at the entry bb.
        IRBuilder<> NewBuilder(
            getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
        Row = NewBuilder.CreateUDiv(II->getOperand(2), NewBuilder.getInt16(4));
      }
      Row = getRowFromCol(II, II->getArgOperand(2), 4);
      Col = II->getArgOperand(1);
      break;
    }
    break;
  }
  case Intrinsic::x86_ttransposed_internal: {
    assert((OpNo == 2) && "Illegal Operand Number.");
    Row = getRowFromCol(II, II->getArgOperand(1), 4);
    Col = getColFromRow(II, II->getArgOperand(0), 4);
    break;
  }
  }

  return std::make_pair(Row, Col);
}

static std::pair<Value *, Value *> getShape(PHINode *Phi) {
std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) {
  Use &U = *(Phi->use_begin());
  unsigned OpNo = U.getOperandNo();
  User *V = U.getUser();
@ -223,14 +304,15 @@ static std::pair<Value *, Value *> getShape(PHINode *Phi) {
namespace {
class X86LowerAMXType {
  Function &Func;
  ShapeCalculator *SC;

  // In AMX intrinsics we let Shape = {Row, Col}, but the
  // RealCol = Col / ElementSize. We may use the RealCol
  // as a new Row for other newly created AMX intrinsics.
  std::map<Value *, Value *> Col2Row;
  std::map<Value *, Value *> Col2Row, Row2Col;

public:
  X86LowerAMXType(Function &F) : Func(F) {}
  X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {}
  bool visit();
  void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
  void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
@ -247,7 +329,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
  Use &U = *(Bitcast->use_begin());
  unsigned OpNo = U.getOperandNo();
  auto *II = cast<IntrinsicInst>(U.getUser());
  std::tie(Row, Col) = getShape(II, OpNo);
  std::tie(Row, Col) = SC->getShape(II, OpNo);
  IRBuilder<> Builder(Bitcast);
  // Use the maximum column as stride.
  Value *Stride = Builder.getInt64(64);
@ -327,7 +409,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
  Builder.CreateStore(Src, AllocaAddr);
  // TODO: we can pick a constant operand for the shape.
  Value *Row = nullptr, *Col = nullptr;
  std::tie(Row, Col) = getShape(II, OpNo);
  std::tie(Row, Col) = SC->getShape(II, OpNo);
  std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
  Value *NewInst =
      Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args);
@ -467,10 +549,18 @@ static Value *getAllocaPos(BasicBlock *BB) {

static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
  assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
  auto *II = cast<IntrinsicInst>(TileDef);
  auto *II = dyn_cast<IntrinsicInst>(TileDef);
  unsigned Idx = 0;
  // Extract tile from multiple tiles' def.
  if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) {
    assert(Extr->hasIndices() && "Tile extract miss index!");
    Idx = Extr->getIndices()[0];
    II = cast<IntrinsicInst>(Extr->getOperand(0));
  }

  assert(II && "Not tile intrinsic!");
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);
  Value *Row = II->getOperand(Idx);
  Value *Col = II->getOperand(Idx + 1);

  BasicBlock *BB = TileDef->getParent();
  BasicBlock::iterator Iter = TileDef->getIterator();
@ -489,14 +579,20 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {

  // Get tile shape.
  IntrinsicInst *II = nullptr;
  unsigned Idx = 0;
  if (IsPHI) {
    Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0);
    II = cast<IntrinsicInst>(PhiOp);
  } else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) {
    // Extract tile from multiple tiles' def.
    assert(Extr->hasIndices() && "Tile extract miss index!");
    Idx = Extr->getIndices()[0];
    II = cast<IntrinsicInst>(Extr->getOperand(0));
  } else {
    II = cast<IntrinsicInst>(V);
  }
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);
  Value *Row = II->getOperand(Idx);
  Value *Col = II->getOperand(Idx + 1);

  Instruction *UserI = cast<Instruction>(U.getUser());
  IRBuilder<> Builder(UserI);
@ -707,10 +803,12 @@ namespace {

class X86LowerAMXCast {
  Function &Func;
  ShapeCalculator *SC;
  std::unique_ptr<DominatorTree> DT;

public:
  X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {}
  X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC)
      : Func(F), SC(ShapeC), DT(nullptr) {}
  bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
  bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
  bool combineLdSt(SmallVectorImpl<Instruction *> &Casts);
@ -788,7 +886,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
      if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue())
        return false;
      Value *Row = nullptr, *Col = nullptr;
      std::tie(Row, Col) = getShape(OldPN);
      std::tie(Row, Col) = SC->getShape(OldPN);
      // TODO: If it is not constant, the Row and Col must dominate the
      // tilezero that we are going to create.
      if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col))
@ -919,6 +1017,19 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
  return true;
}

static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx,
                                       bool IsRow) {
  if (!isAMXIntrinsic(Inst))
    return nullptr;

  auto *II = cast<IntrinsicInst>(Inst);
  if (IsRow)
    return II->getOperand(0);

  assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!");
  return II->getOperand(ShapeIdx + 1);
}

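
getShapeFromAMXIntrinsic leans on the operand convention of the two-tile loads: one shared row followed by one column per returned tile, i.e. (row, col0, col1, ...). A minimal sketch of that lookup over a plain operand array (illustrative only):

#include <cassert>
#include <cstdio>
#include <vector>

// Operand layout of a multi-tile AMX load: ops[0] = row, ops[1 + i] = the
// column of result tile i. Mirrors getShapeFromAMXIntrinsic() above.
int getShape(const std::vector<int> &Ops, unsigned ShapeIdx, bool IsRow) {
  if (IsRow)
    return Ops[0];
  assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!");
  return Ops[ShapeIdx + 1];
}

int main() {
  std::vector<int> Ops = {16, 64, 32}; // row = 16, col0 = 64, col1 = 32
  std::printf("%d %d %d\n", getShape(Ops, 1, true), getShape(Ops, 0, false),
              getShape(Ops, 1, false)); // 16 64 32
  return 0;
}
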
// %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42)
// store <256 x i32> %43, <256 x i32>* %p, align 64
// -->
@ -926,16 +1037,46 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
// i64 64, x86_amx %42)
bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
  Value *Tile = Cast->getOperand(0);
  // TODO: If it is a cast intrinsic or phi node, we can propagate the
  // shape information through the def-use chain.
  if (!isAMXIntrinsic(Tile))

  assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!");

  // TODO: Specially handle the multi-use case.
  if (Tile->getNumUses() != 1)
    return false;
  auto *II = cast<IntrinsicInst>(Tile);
  // Tile is output from an AMX intrinsic. The first operand of the
  // intrinsic is row, the second operand of the intrinsic is column.
  Value *Row = II->getOperand(0);
  Value *Col = II->getOperand(1);

  // We don't fetch the shape from a tilestore, we only get the shape from a
  // tiledef, so we can set the max tile shape to a tilestore for special cases.
  IRBuilder<> Builder(ST);
  Value *Row = nullptr;
  Value *Col = nullptr;

  if (isAMXIntrinsic(Tile)) {
    auto *II = cast<IntrinsicInst>(Tile);
    // Tile is output from an AMX intrinsic. The first operand of the
    // intrinsic is row, the second operand of the intrinsic is column.
    Row = II->getOperand(0);
    Col = II->getOperand(1);
  } else {
    // Now we support multi-tile values in a structure, so the tile may come
    // from extracting a multi-tile structure.
    // For example:
    //   %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1,
    //        i16 %2, i16 %3, i8* %4, i64 %5)
    //   %7 = extractvalue { x86_amx, x86_amx } %6, 0
    //   %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7)
    //   store <256 x i32> %8, <256 x i32>* %0, align 1024
    //
    // TODO: Currently we only handle the extractvalue case; enhance me for
    // other cases if possible.
    auto *II = cast<ExtractValueInst>(Tile);
    assert(II && "Unhandled source in fetching tile value!");
    unsigned ShapeIdx = II->getIndices()[0];
    Value *Tiles = II->getOperand(0);
    Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true);
    Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false);
  }
  assert(Row && Col && "Failed to get shape!");

  // Stride should be equal to col (measured in bytes).
  Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
  Value *I8Ptr = Builder.CreateBitCast(ST->getOperand(1), Builder.getPtrTy());
@ -959,7 +1100,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
  // shape information through the def-use chain.
  if (!isAMXIntrinsic(II))
    return false;
  std::tie(Row, Col) = getShape(II, OpNo);
  std::tie(Row, Col) = SC->getShape(II, OpNo);
  IRBuilder<> Builder(LD);
  // Stride should be equal to col (measured in bytes).
  Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
@ -1169,7 +1310,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
  Builder.CreateStore(Src, AllocaAddr);
  // TODO: we can pick a constant operand for the shape.
  Value *Row = nullptr, *Col = nullptr;
  std::tie(Row, Col) = getShape(II, OpNo);
  std::tie(Row, Col) = SC->getShape(II, OpNo);
  std::array<Value *, 4> Args = {
      Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
  Value *NewInst =
@ -1245,13 +1386,14 @@ public:
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);

    X86LowerAMXCast LAC(F);
    ShapeCalculator SC(TM);
    X86LowerAMXCast LAC(F, &SC);
    C |= LAC.combineAMXcast(TLI);
    // There might be remaining AMXcast after combineAMXcast and they should be
    // handled elegantly.
    C |= LAC.transformAllAMXCast();

    X86LowerAMXType LAT(F);
    X86LowerAMXType LAT(F, &SC);
    C |= LAT.visit();

    // Prepare for fast register allocation at O0.
@ -118,16 +118,27 @@ class X86PreTileConfig : public MachineFunctionPass {
  bool isAMXInstruction(MachineInstr &MI) {
    if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3)
      return false;
    MachineOperand &MO = MI.getOperand(0);

    // PTILESTOREDV is the only exception that doesn't def an AMX register.
    if (MI.getOpcode() == X86::PTILESTOREDV)
      return true;

    // We can simply check if it is an AMX instruction by its def.
    // But we should exclude the old API, which uses physical registers.
    if (MO.isReg() && MO.getReg().isVirtual() &&
        MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) {
      collectShapeInfo(MI);
      return true;
    }
    // PTILESTOREDV is the only exception that doesn't def an AMX register.
    return MI.getOpcode() == X86::PTILESTOREDV;
    MachineOperand &MO = MI.getOperand(0);
    if (!MO.isReg() || !MO.getReg().isVirtual())
      return false;

    unsigned Shapes = 0;
    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
      Shapes = 1;
    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID)
      Shapes = 2;
    if (!Shapes)
      return false;

    collectShapeInfo(MI, Shapes);
    return true;
  }

  /// Check if it is an edge from loop bottom to loop head.
@ -142,7 +153,7 @@ class X86PreTileConfig : public MachineFunctionPass {
  }

  /// Collect the shape def information for later use.
  void collectShapeInfo(MachineInstr &MI);
  void collectShapeInfo(MachineInstr &MI, unsigned Shapes);

  /// Try to hoist shapes defined below AMX instructions.
  bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
@ -208,7 +219,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                    "Tile Register Pre-configure", false, false)

void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
  auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
    MIRef MIR(MI, MBB);
    auto I = llvm::lower_bound(ShapeBBs[MBB], MIR);
@ -216,8 +227,10 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
      ShapeBBs[MBB].insert(I, MIR);
  };

  SmallVector<Register, 8> WorkList(
      {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
  // All shapes have the same row in a multi-tile operand.
  SmallVector<Register, 8> WorkList;
  for (unsigned I = 1; I < Shapes + 2; ++I)
    WorkList.push_back(MI.getOperand(I).getReg());
  while (!WorkList.empty()) {
    Register R = WorkList.pop_back_val();
    MachineInstr *DefMI = MRI->getVRegDef(R);
@ -225,6 +238,14 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
    MachineBasicBlock *DefMBB = DefMI->getParent();
    if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
      continue;

    // This happens when column = 0 in a multi-tile operand.
    if (DefMI->getOpcode() == X86::COPY) {
      MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg());
      if (MI && MI->isMoveImmediate())
        continue;
    }

    if (DefMI->isPHI()) {
      for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
        if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
@ -642,6 +642,10 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
|
||||
Reserved.set(*AI);
|
||||
}
|
||||
|
||||
// Reserve low half pair registers in case they are used by RA aggressively.
|
||||
Reserved.set(X86::TMM0_TMM1);
|
||||
Reserved.set(X86::TMM2_TMM3);
|
||||
|
||||
assert(checkAllSuperRegsMarked(Reserved,
|
||||
{X86::SIL, X86::DIL, X86::BPL, X86::SPL,
|
||||
X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
|
||||
@ -662,7 +666,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
|
||||
// and try to return the minimum number of registers supported by the target.
|
||||
static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
|
||||
(X86::K6_K7 + 1 == X86::TMMCFG) &&
|
||||
(X86::TMM7 + 1 == X86::R16) &&
|
||||
(X86::TMM6_TMM7 + 1 == X86::R16) &&
|
||||
(X86::R31WH + 1 == X86::NUM_TARGET_REGS),
|
||||
"Register number may be incorrect");
|
||||
|
||||
@ -735,7 +739,8 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF,
|
||||
}
|
||||
|
||||
bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const {
|
||||
return RC->getID() == X86::TILERegClassID;
|
||||
return RC->getID() == X86::TILERegClassID ||
|
||||
RC->getID() == X86::TILEPAIRRegClassID;
|
||||
}
|
||||
|
||||
void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
|
||||
@ -1073,12 +1078,59 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
|
||||
case X86::PTDPFP16PSV:
|
||||
case X86::PTCMMIMFP16PSV:
|
||||
case X86::PTCMMRLFP16PSV:
|
||||
case X86::PTTRANSPOSEDV: {
|
||||
MachineOperand &MO1 = MI->getOperand(1);
|
||||
MachineOperand &MO2 = MI->getOperand(2);
|
||||
ShapeT Shape(&MO1, &MO2, MRI);
|
||||
VRM->assignVirt2Shape(VirtReg, Shape);
|
||||
return Shape;
|
||||
}
|
||||
case X86::PT2RPNTLVWZ0V:
|
||||
case X86::PT2RPNTLVWZ0T1V:
|
||||
case X86::PT2RPNTLVWZ1V:
|
||||
case X86::PT2RPNTLVWZ1T1V: {
|
||||
MachineOperand &MO1 = MI->getOperand(1);
|
||||
MachineOperand &MO2 = MI->getOperand(2);
|
||||
MachineOperand &MO3 = MI->getOperand(3);
|
||||
ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI);
|
||||
VRM->assignVirt2Shape(VirtReg, Shape);
|
||||
return Shape;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) {
|
||||
unsigned PhysShapeNum = PhysShape.getShapeNum();
|
||||
unsigned VirtShapeNum = VirtShape.getShapeNum();
|
||||
|
||||
if (PhysShapeNum < VirtShapeNum)
|
||||
return false;
|
||||
|
||||
if (PhysShapeNum == VirtShapeNum) {
|
||||
if (PhysShapeNum == 1)
|
||||
return PhysShape == VirtShape;
|
||||
|
||||
for (unsigned I = 0; I < PhysShapeNum; I++) {
|
||||
ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
|
||||
ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I));
|
||||
if (VShape != PShape)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
  // Hint a subreg of a multi-tile reg to a single tile reg.
|
||||
if (VirtShapeNum == 1) {
|
||||
for (unsigned I = 0; I < PhysShapeNum; I++) {
|
||||
ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
|
||||
if (VirtShape == PShape)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Note: Currently we have no requirement for case of
|
||||
// (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum)
|
||||
return false;
|
||||
}
|
||||
|
||||
bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
|
||||
@ -1099,7 +1151,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
|
||||
if (!VRM)
|
||||
return BaseImplRetVal;
|
||||
|
||||
if (ID != X86::TILERegClassID) {
|
||||
if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) {
|
||||
if (DisableRegAllocNDDHints || !ST.hasNDD() ||
|
||||
!TRI.isGeneralPurposeRegisterClass(&RC))
|
||||
return BaseImplRetVal;
|
||||
@ -1151,7 +1203,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
|
||||
return;
|
||||
}
|
||||
ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
|
||||
if (PhysShape == VirtShape)
|
||||
if (canHintShape(PhysShape, VirtShape))
|
||||
Hints.push_back(PhysReg);
|
||||
};
|
||||
|
||||
|
@ -30,6 +30,8 @@ let Namespace = "X86" in {
|
||||
def sub_ymm : SubRegIndex<256>;
|
||||
def sub_mask_0 : SubRegIndex<-1>;
|
||||
def sub_mask_1 : SubRegIndex<-1, -1>;
|
||||
def sub_t0 : SubRegIndex<8192>;
|
||||
def sub_t1 : SubRegIndex<8192, 8192>;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
@ -431,6 +433,10 @@ def TMM5: X86Reg<"tmm5", 5>;
|
||||
def TMM6: X86Reg<"tmm6", 6>;
|
||||
def TMM7: X86Reg<"tmm7", 7>;
|
||||
}
|
||||
// TMM register pairs
|
||||
def TPAIRS : RegisterTuples<[sub_t0, sub_t1],
|
||||
[(add TMM0, TMM2, TMM4, TMM6),
|
||||
(add TMM1, TMM3, TMM5, TMM7)]>;
|
||||
|
||||
// Floating point stack registers. These don't map one-to-one to the FP
|
||||
// pseudo registers, but we still mark them as aliasing FP registers. That
|
||||
@ -835,6 +841,9 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
|
||||
let CopyCost = -1 in // Don't allow copying of tile registers
|
||||
def TILE : RegisterClass<"X86", [x86amx], 8192,
|
||||
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
|
||||
// Need to check the alignment; the 3rd operand's size is 1024*2*8 bits.
|
||||
let isAllocatable = 1 in
|
||||
def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Register categories.
|
||||
|
@ -76,6 +76,63 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
|
||||
INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false,
|
||||
false)
|
||||
|
||||
unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) {
|
||||
if (Reg.isVirtual()) {
|
||||
unsigned RegClassID = MRI->getRegClass(Reg)->getID();
|
||||
if (RegClassID == X86::TILERegClassID)
|
||||
return 1;
|
||||
if (RegClassID == X86::TILEPAIRRegClassID)
|
||||
return 2;
|
||||
} else {
|
||||
if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
|
||||
return 1;
|
||||
if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
|
||||
return 2;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM,
|
||||
Register VirtReg,
|
||||
SmallVector<ShapeT, 8> &Phys2Shapes) {
|
||||
unsigned Num = getAMXRegNum(MRI, VirtReg);
|
||||
MCRegister PhysReg = VRM.getPhys(VirtReg);
|
||||
if (!PhysReg)
|
||||
return;
|
||||
|
||||
if (Num == 1) {
|
||||
unsigned Index = PhysReg - X86::TMM0;
|
||||
if (!Phys2Shapes[Index].isValid()) {
|
||||
ShapeT Shape = VRM.getShape(VirtReg);
|
||||
Phys2Shapes[Index] = Shape;
|
||||
return;
|
||||
}
|
||||
}
|
||||
  // Split a tile pair's shape info into 2 single-tile shape infos, e.g.:
  // put TMM0_TMM1's shape into TMM0's shape + TMM1's shape in Phys2Shapes.
|
||||
if (Num == 2) {
|
||||
unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2;
|
||||
unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1;
|
||||
|
||||
ShapeT Shape = VRM.getShape(VirtReg);
|
||||
assert(Shape.getShapeNum() == 2 && "Unexpected shape number!");
|
||||
|
||||
if (!Phys2Shapes[Index0].isValid()) {
|
||||
ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI);
|
||||
Phys2Shapes[Index0] = Shape0;
|
||||
}
|
||||
|
||||
if (!Phys2Shapes[Index1].isValid()) {
|
||||
ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI);
|
||||
Phys2Shapes[Index1] = Shape1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) {
|
||||
return getAMXRegNum(MRI, Reg) > 0;
|
||||
}
|
||||
|
||||
bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
|
||||
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
|
||||
// Early exit in the common case of non-AMX code.
|
||||
@ -121,29 +178,24 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
|
||||
assert(ConstMI && "Cannot find an insertion point");
|
||||
|
||||
unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs();
|
||||
SmallVector<Register, 8> Phys2Virt(AMXRegNum, 0);
|
||||
SmallVector<ShapeT, 8> Phys2Shapes(AMXRegNum, ShapeT());
|
||||
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
|
||||
Register VirtReg = Register::index2VirtReg(I);
|
||||
if (MRI.reg_nodbg_empty(VirtReg))
|
||||
continue;
|
||||
if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID)
|
||||
if (!isAMXRegClass(&MRI, VirtReg))
|
||||
continue;
|
||||
MCRegister PhysReg = VRM.getPhys(VirtReg);
|
||||
if (!PhysReg)
|
||||
continue;
|
||||
unsigned Index = PhysReg - X86::TMM0;
|
||||
if (!Phys2Virt[Index])
|
||||
Phys2Virt[Index] = VirtReg;
|
||||
collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes);
|
||||
}
|
||||
|
||||
// Fill in the shape of each tile physical register.
|
||||
for (unsigned I = 0; I < AMXRegNum; ++I) {
|
||||
if (!Phys2Virt[I])
|
||||
ShapeT Shape = Phys2Shapes[I];
|
||||
if (!Shape.isValid())
|
||||
continue;
|
||||
DebugLoc DL;
|
||||
bool IsRow = true;
|
||||
MachineInstr *NewMI = nullptr;
|
||||
ShapeT Shape = VRM.getShape(Phys2Virt[I]);
|
||||
for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) {
|
||||
// Here is the data format for the tile config.
|
||||
// 0 palette
|
||||
@ -172,7 +224,15 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
|
||||
"Cannot initialize with different shapes");
|
||||
continue;
|
||||
}
|
||||
Imm = DefMI.getOperand(1).getImm();
|
||||
if (DefMI.getOperand(1).isImm()) {
|
||||
Imm = DefMI.getOperand(1).getImm();
|
||||
} else {
|
||||
assert(DefMI.getOpcode() == X86::MOV32r0 &&
|
||||
"The opcode is assumed to be MOV32r0 if the operand is not "
|
||||
"immediate.");
|
||||
Imm = 0;
|
||||
}
|
||||
|
||||
NewMI = addFrameReference(
|
||||
BuildMI(MF.front(), ++ConstMI->getIterator(), DL,
|
||||
TII->get(IsRow ? X86::MOV8mi : X86::MOV16mi)),
|
||||
|
@ -1879,6 +1879,7 @@ const StringMap<bool> sys::getHostCPUFeatures() {
|
||||
bool HasLeaf1E = MaxLevel >= 0x1e &&
|
||||
!getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
|
||||
Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
|
||||
Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
|
||||
|
||||
bool HasLeaf24 =
|
||||
MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);
|
||||
|
@ -599,6 +599,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
|
||||
constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
|
||||
constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
|
||||
constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
|
||||
constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
|
||||
constexpr FeatureBitset ImpliedFeaturesHRESET = {};
|
||||
|
||||
constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {};
|
||||
|
136
llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
Normal file
@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s

@buf = dso_local global [2048 x i8] zeroinitializer, align 16
|
||||
@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
|
||||
|
||||
define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: test_tile_2rpntlvwz0:
|
||||
; CHECK: # %bb.0: # %entry
|
||||
; CHECK-NEXT: pushq %rbp
|
||||
; CHECK-NEXT: .cfi_def_cfa_offset 16
|
||||
; CHECK-NEXT: .cfi_offset %rbp, -16
|
||||
; CHECK-NEXT: movq %rsp, %rbp
|
||||
; CHECK-NEXT: .cfi_def_cfa_register %rbp
|
||||
; CHECK-NEXT: pushq %rbx
|
||||
; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
|
||||
; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000
|
||||
; CHECK-NEXT: .cfi_offset %rbx, -24
|
||||
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
|
||||
; CHECK-NEXT: movw %si, %cx
|
||||
; CHECK-NEXT: movw %di, %ax
|
||||
; CHECK-NEXT: # implicit-def: $al
|
||||
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: # implicit-def: $al
|
||||
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: # implicit-def: $al
|
||||
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: # implicit-def: $cl
|
||||
; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: # implicit-def: $al
|
||||
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: # implicit-def: $al
|
||||
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: # implicit-def: $al
|
||||
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: # implicit-def: $al
|
||||
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movl $buf, %esi
|
||||
; CHECK-NEXT: movl $32, %edi
|
||||
; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4
|
||||
; CHECK-NEXT: movabsq $64, %rbx
|
||||
; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill
|
||||
; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload
|
||||
; CHECK-NEXT: movabsq $64, %rbx
|
||||
; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill
|
||||
; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload
|
||||
; CHECK-NEXT: movl $64, %edi
|
||||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi)
|
||||
; CHECK-NEXT: movl $64, %edi
|
||||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
|
||||
; CHECK-NEXT: tilezero %tmm0
|
||||
; CHECK-NEXT: movl $64, %edi
|
||||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
|
||||
; CHECK-NEXT: movl $64, %edi
|
||||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1
|
||||
; CHECK-NEXT: movl $64, %edi
|
||||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2
|
||||
; CHECK-NEXT: movl $64, %edi
|
||||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
|
||||
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
|
||||
; CHECK-NEXT: movl $64, %edi
|
||||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
|
||||
; CHECK-NEXT: movl $64, %edi
|
||||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
|
||||
; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
|
||||
; CHECK-NEXT: movl $buf2, %edx
|
||||
; CHECK-NEXT: movl $32, %esi
|
||||
; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; CHECK-NEXT: leaq -8(%rbp), %rsp
|
||||
; CHECK-NEXT: popq %rbx
|
||||
; CHECK-NEXT: popq %rbp
|
||||
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
|
||||
; CHECK-NEXT: tilerelease
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retq
|
||||
entry:
|
||||
%0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3
|
||||
%1 = extractvalue { x86_amx, x86_amx } %0, 0
|
||||
%2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3
|
||||
%3 = extractvalue { x86_amx, x86_amx } %0, 1
|
||||
%4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3
|
||||
%5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3
|
||||
%6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3
|
||||
%7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3
|
||||
%8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3
|
||||
%9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3
|
||||
%10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3
|
||||
%11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3
|
||||
%12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3
|
||||
ret void
|
||||
}
|
||||
|
||||
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
|
||||
|
||||
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
|
||||
|
||||
declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
|
||||
|
||||
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
|
||||
|
||||
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
|
||||
|
||||
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
|
||||
|
||||
attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" }
|
||||
attributes #1 = { argmemonly nofree nounwind readonly }
|
||||
attributes #2 = { nofree nosync nounwind readnone }
|
||||
attributes #3 = { nounwind }
|
||||
attributes #4 = { argmemonly nounwind writeonly }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
|
||||
!0 = !{i32 1, !"wchar_size", i32 4}
|
||||
!1 = !{i32 7, !"uwtable", i32 2}
|
||||
!2 = !{i32 7, !"frame-pointer", i32 2}
|
165
llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
Normal file
@ -0,0 +1,165 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s
|
||||
---
|
||||
name: test_tile_2rpntlvwz0
|
||||
alignment: 16
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
failedISel: false
|
||||
tracksRegLiveness: true
|
||||
hasWinCFI: false
|
||||
callsEHReturn: false
|
||||
callsUnwindInit: false
|
||||
hasEHCatchret: false
|
||||
hasEHScopes: false
|
||||
hasEHFunclets: false
|
||||
failsVerification: false
|
||||
tracksDebugUserValues: false
|
||||
registers: []
|
||||
liveins:
|
||||
- { reg: '$edi', virtual-reg: '' }
|
||||
- { reg: '$esi', virtual-reg: '' }
|
||||
- { reg: '$edx', virtual-reg: '' }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 1024
|
||||
adjustsStack: false
|
||||
hasCalls: true
|
||||
stackProtector: ''
|
||||
functionContext: ''
|
||||
maxCallFrameSize: 4294967295
|
||||
cvBytesOfCalleeSavedRegisters: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
hasTailCall: false
|
||||
localFrameSize: 0
|
||||
savePoint: ''
|
||||
restorePoint: ''
|
||||
fixedStack: []
|
||||
stack:
|
||||
- { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
|
||||
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
|
||||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
|
||||
- { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8,
|
||||
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
|
||||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
|
||||
- { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8,
|
||||
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
|
||||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
|
||||
- { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo:
amxProgModel: ManagedRA
body: |
bb.0.entry:
liveins: $rdi, $rsi, $rdx, $rax

; CHECK-LABEL: name: test_tile_2rpntlvwz0
; CHECK: liveins: $rdi, $rsi, $rdx, $rax
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0
; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
; CHECK-NEXT: renamable $rcx = MOV32ri64 64
; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
; CHECK-NEXT: renamable $cx = MOV16ri 64
; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
; CHECK-NEXT: renamable $cx = MOV16ri 16
; CHECK-NEXT: renamable $r8w = MOV16ri 16
; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
; CHECK-NEXT: $al = IMPLICIT_DEF
; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
; CHECK-NEXT: $al = IMPLICIT_DEF
; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4)
; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4)
; CHECK-NEXT: $al = IMPLICIT_DEF
; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4)
; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4)
; CHECK-NEXT: $al = IMPLICIT_DEF
; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
; CHECK-NEXT: $al = IMPLICIT_DEF
; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
; CHECK-NEXT: $al = IMPLICIT_DEF
; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4)
; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4)
; CHECK-NEXT: $al = IMPLICIT_DEF
; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4)
; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4)
; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
; CHECK-NEXT: renamable $r9 = COPY $rsi
; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
; CHECK-NEXT: renamable $r8 = COPY $rdi
; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
; CHECK-NEXT: renamable $r10 = COPY $rax
; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5
; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
renamable $zmm0 = AVX512_512_SET0
VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
renamable $rcx = MOV32ri64 64
MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
renamable $cx = MOV16ri 64
MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
renamable $cx = MOV16ri 16
renamable $r8w = MOV16ri 16
MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
renamable $r9 = COPY $rsi
$rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
renamable $r8 = COPY $rdi
$di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
renamable $r10 = COPY $rax
$ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
renamable $tmm0 = COPY renamable $tmm5
renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
...
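Editor's note: the magic offsets in the MOV8mr/MOV16mr stores above are fields of the 64-byte tile configuration block that PLDTILECFGV consumes. A sketch of the layout as these tests appear to use it (field names and offsets are inferred from the stores and from the standard AMX tile-config format, not stated in this diff):

byte 0          palette (set to 1 by the MOV8mi)
bytes 16..47    colsb[0..15], one u16 bytes-per-row entry per tile (colsb[n] at 16 + 2*n)
bytes 48..63    rows[0..15], one u8 row count per tile (rows[n] at 48 + n)

Under that reading, the stores to offsets 24/52 and 26/53 describe the shapes of tmm4 and tmm5, the pair defined by PT2RPNTLVWZ0V.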
153
llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
Normal file
@ -0,0 +1,153 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s

--- |
@buf = dso_local global [2048 x i8] zeroinitializer, align 16
@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16

define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
entry:
%0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5
%1 = extractvalue { x86_amx, x86_amx } %0, 0
%2 = extractvalue { x86_amx, x86_amx } %0, 1
%3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
%4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5
tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5
ret void
}

declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1

declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2

declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3

declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3

declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2

declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4

attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #5 = { nounwind }

...
---
name: test_tile_2rpntlvwz0
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
callsEHReturn: false
callsUnwindInit: false
hasEHCatchret: false
hasEHScopes: false
hasEHFunclets: false
failsVerification: false
tracksDebugUserValues: false
registers:
- { id: 0, class: gr32, preferred-register: '' }
- { id: 1, class: gr32, preferred-register: '' }
- { id: 2, class: gr32, preferred-register: '' }
- { id: 3, class: gr16, preferred-register: '' }
- { id: 4, class: gr16, preferred-register: '' }
- { id: 5, class: gr16, preferred-register: '' }
- { id: 6, class: gr64, preferred-register: '' }
- { id: 7, class: gr64_nosp, preferred-register: '' }
- { id: 8, class: tilepair, preferred-register: '' }
- { id: 9, class: tile, preferred-register: '' }
- { id: 10, class: tile, preferred-register: '' }
- { id: 11, class: tile, preferred-register: '' }
- { id: 12, class: tile, preferred-register: '' }
- { id: 13, class: gr64, preferred-register: '' }
- { id: 14, class: vr512, preferred-register: '' }
liveins:
- { reg: '$edi', virtual-reg: '%0' }
- { reg: '$esi', virtual-reg: '%1' }
- { reg: '$edx', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
functionContext: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo:
amxProgModel: ManagedRA
body: |
bb.0.entry:
liveins: $edi, $esi, $edx


; CHECK-LABEL: name: test_tile_2rpntlvwz0
; CHECK: liveins: $edi, $esi, $edx
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4)
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4)
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4)
; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf
; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32
; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg
; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit
; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1
; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2
; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]]
; CHECK-NEXT: RET 0
%2:gr32 = COPY $edx
%1:gr32 = COPY $esi
%0:gr32 = COPY $edi
%14:vr512 = AVX512_512_SET0
VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4)
MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
%6:gr64 = MOV32ri64 @buf
%7:gr64_nosp = MOV32ri64 32
%8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg
%12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit
%12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1
%13:gr64 = MOV32ri64 @buf2
PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12
RET 0

...
97
llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
Normal file
@ -0,0 +1,97 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s

---
name: test_tile_2rpntlvwz0
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
callsEHReturn: false
callsUnwindInit: false
hasEHCatchret: false
hasEHScopes: false
hasEHFunclets: false
failsVerification: false
tracksDebugUserValues: false
registers: []
liveins:
- { reg: '$edi', virtual-reg: '' }
- { reg: '$esi', virtual-reg: '' }
- { reg: '$edx', virtual-reg: '' }
- { reg: '$cx', virtual-reg: '' }
- { reg: '$r9', virtual-reg: '' }
- { reg: '$r10', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 1024
adjustsStack: false
hasCalls: true
stackProtector: ''
functionContext: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack:
- { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo:
amxProgModel: ManagedRA
body: |
bb.0.entry:
liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9


; CHECK-LABEL: name: test_tile_2rpntlvwz0
; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
; CHECK-NEXT: $rax = MOV64ri 64
; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3)
; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3)
; CHECK-NEXT: $rax = MOV64ri 64
; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2)
; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2)
; CHECK-NEXT: renamable $r8 = MOV32ri64 64
; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1)
; CHECK-NEXT: renamable $di = MOV16ri 64
; CHECK-NEXT: renamable $cx = MOV16ri 16
; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4)
renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
renamable $tmm0 = COPY renamable $tmm5
renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
renamable $r8 = MOV32ri64 64
MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68)
renamable $di = MOV16ri 64
renamable $cx = MOV16ri 16
PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0

...
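Editor's note: tile registers cannot be copied register-to-register, so the lowertilecopy pass under test bounces each COPY through a 1024-byte stack slot, which is exactly what the TILESTORED/TILELOADD pairs in the CHECK lines verify. Schematically (simplified from the output above; %stack.s stands for whichever spill slot the pass creates):

renamable $tmm0 = COPY renamable $tmm5
; is rewritten to
$rax = MOV64ri 64
TILESTORED %stack.s, 1, $rax, 0, $noreg, $tmm5
$tmm0 = TILELOADD %stack.s, 1, killed $rax, 0, $noreg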
86
llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
Normal file
@ -0,0 +1,86 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s

@buf = dso_local global [2048 x i8] zeroinitializer, align 16

; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 {
; CHECK-LABEL: @test_tile_2rpntlvwz0(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4
; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0
; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]])
; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1
; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]])
; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]])
; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64
; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]])
; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]])
; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]]
; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]])
; CHECK-NEXT: ret void
;
entry:

%0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7
%1 = extractvalue { x86_amx, x86_amx } %0, 0
%2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7
store <256 x i32> %2, ptr %m, align 1024

%3 = extractvalue { x86_amx, x86_amx } %0, 1
%4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7
store <256 x i32> %4, ptr %m, align 1024

%5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7
%6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7
store <256 x i32> %6, ptr %m, align 64

%7 = load <256 x i32>, ptr %m, align 64
%8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7
%9 = load <256 x i32>, ptr %m, align 64
%10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7
%11 = load <256 x i32>, ptr %m, align 64
%12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7

%13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7
%14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7
store <256 x i32> %14, ptr %m, align 64

ret void
}

; Function Attrs: argmemonly nounwind readonly
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2

; Function Attrs: nounwind readnone
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4

; Function Attrs: nounwind readnone
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3

; Function Attrs: argmemonly nounwind writeonly
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5

attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #7 = { nounwind }
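Editor's note: at -O0 the -lower-amx-type pass keeps no tile value live across the bitcast intrinsics: a cast.tile.to.vector paired with a store becomes a direct tilestored64 into the vector's memory, and a load paired with cast.vector.to.tile becomes a tileloadd64, which is what the CHECK lines above verify. A minimal sketch of the rewrite (the names %v, %t, %m are illustrative, not from the test):

%v = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t)
store <256 x i32> %v, ptr %m, align 64
; becomes
%len = sext i16 %col to i64
call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %m, i64 %len, x86_amx %t)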
60
llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
Normal file
@ -0,0 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s

@buf = dso_local global [2048 x i8] zeroinitializer, align 16
@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16

; Function Attrs: nounwind uwtable
define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
; CHECK-LABEL: @test_tile_2rpntlvwz0(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1
; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]]
; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]]
; CHECK-NEXT: ret void
;
entry:
%0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5
%1 = extractvalue { x86_amx, x86_amx } %0, 0
%2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5
%3 = extractvalue { x86_amx, x86_amx } %0, 1
%4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5
%5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
%6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5
%7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5
%8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5
%9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5
%10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5
%11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5
%12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5
tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5
ret void
}

; Function Attrs: argmemonly nounwind readonly
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1

; Function Attrs: nounwind readnone
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3

; Function Attrs: nounwind readnone
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2

; Function Attrs: argmemonly nounwind writeonly
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4

attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
attributes #5 = { nounwind }
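Editor's note: at -O2 the same pass folds matching cast pairs away instead of spilling, so the tiles flow straight from the paired load into tdpbssd and the final store; no tileloadd64/tilestored64 round trips survive in the CHECK lines. The key simplification, sketched with illustrative names:

%v = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t)
%t2 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %v)
; %t2 is replaced by %t and both casts are deleted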
134
llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
Normal file
@ -0,0 +1,134 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s

---
name: test_tile_2rpntlvwz0
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
callsEHReturn: false
callsUnwindInit: false
hasEHCatchret: false
hasEHScopes: false
hasEHFunclets: false
failsVerification: false
tracksDebugUserValues: false
registers:
- { id: 0, class: gr64_nosp, preferred-register: '' }
- { id: 1, class: gr16, preferred-register: '' }
- { id: 2, class: gr16, preferred-register: '' }
- { id: 3, class: gr16, preferred-register: '' }
- { id: 4, class: gr64, preferred-register: '' }
- { id: 5, class: gr64, preferred-register: '' }
- { id: 6, class: gr64, preferred-register: '' }
- { id: 7, class: gr64_nosp, preferred-register: '' }
- { id: 8, class: tilepair, preferred-register: '' }
- { id: 9, class: tile, preferred-register: '' }
- { id: 10, class: tile, preferred-register: '' }
- { id: 11, class: tile, preferred-register: '' }
- { id: 181, class: tile, preferred-register: '' }
- { id: 183, class: tile, preferred-register: '' }
- { id: 185, class: tile, preferred-register: '' }
- { id: 186, class: tile, preferred-register: '' }
liveins:
- { reg: '$edi', virtual-reg: '%0' }
- { reg: '$esi', virtual-reg: '%1' }
- { reg: '$edx', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 1024
adjustsStack: false
hasCalls: true
stackProtector: ''
functionContext: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack:
- { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 21, name: '', type: default, offset: 0, size: 8,
alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo:
amxProgModel: ManagedRA
body: |
bb.0.entry:
liveins: $rdi, $rsi, $rdx, $rax

; CHECK-LABEL: name: test_tile_2rpntlvwz0
; CHECK: liveins: $rdi, $rsi, $rdx, $rax
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4)
; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64
; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64
; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16
; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16
; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4)
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax
; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg
; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]]
; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]]
; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]]
; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]]
; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg
; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg
; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg
; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]]
; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
%0:gr64_nosp = MOV32ri64 64
%1:gr16 = MOV16ri 64
%2:gr16 = MOV16ri 16
%3:gr16 = MOV16ri 16
%4:gr64 = COPY $rsi
%5:gr64 = COPY $rdi
%6:gr64 = COPY $rdx
%7:gr64_nosp = COPY $rax
%8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg
%9:tile = COPY %8.sub_t1
%10:tile = COPY %8.sub_t0
PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10
PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9
%11:tile = PTILEZEROV %1, %2
PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11
%181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg
%183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg
%185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg
%186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185
PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186
...
113
llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
Normal file
@ -0,0 +1,113 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s

---
name: test_tile_2rpntlvwz0
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
callsEHReturn: false
callsUnwindInit: false
hasEHCatchret: false
hasEHScopes: false
hasEHFunclets: false
failsVerification: false
tracksDebugUserValues: false
registers:
- { id: 0, class: gr32, preferred-register: '' }
- { id: 1, class: gr32, preferred-register: '' }
- { id: 2, class: gr32, preferred-register: '' }
- { id: 3, class: gr16, preferred-register: '' }
- { id: 4, class: gr16, preferred-register: '' }
- { id: 5, class: gr16, preferred-register: '' }
- { id: 6, class: gr64, preferred-register: '' }
- { id: 7, class: gr64_nosp, preferred-register: '' }
- { id: 8, class: tilepair, preferred-register: '' }
- { id: 9, class: tile, preferred-register: '' }
- { id: 10, class: tile, preferred-register: '' }
- { id: 11, class: tile, preferred-register: '' }
- { id: 12, class: tile, preferred-register: '' }
- { id: 13, class: gr64, preferred-register: '' }
liveins:
- { reg: '$edi', virtual-reg: '%0' }
- { reg: '$esi', virtual-reg: '%1' }
- { reg: '$edx', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 1
adjustsStack: false
hasCalls: false
stackProtector: ''
functionContext: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack: []
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo:
amxProgModel: ManagedRA
body: |
bb.0.entry:
liveins: $edi, $esi, $edx, $rax, $rbx

; CHECK-LABEL: name: test_tile_2rpntlvwz0
; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit
; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4)
; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax
; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32
; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg
; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]]
; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx
; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
; CHECK-NEXT: RET 0
%2:gr32 = COPY $edx
%1:gr32 = COPY $esi
%0:gr32 = COPY $edi
%3:gr16 = COPY %2.sub_16bit
%4:gr16 = COPY %1.sub_16bit
%5:gr16 = COPY %0.sub_16bit
%6:gr64 = COPY $rax
%7:gr64_nosp = MOV32ri64 32
%8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg
%9:tile = COPY %8.sub_t1
%10:tile = COPY %8.sub_t0
%11:tile = PTILEZEROV %5, %4
%12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9
%13:gr64 = COPY $rbx
PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12
RET 0

...
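Editor's note: both preconfigure tests (fastpretileconfig at -O0 and tilepreconfig at -O2) check the same preparation step: the pass reserves a 64-byte stack slot, zeroes it with a vr512 store, writes palette byte 1, and inserts PLDTILECFGV with implicit-defs of tmm0-tmm7 ahead of the first tile instruction; the actual row/column fields are filled in later by the configure passes shown earlier. The skeleton, taken from the CHECK lines above (%z stands for the zeroed vr512 vreg):

%z:vr512 = AVX512_512_SET0
VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %z :: (store (s512) into %stack.0, align 4)
MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4)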
150
llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
Normal file
@ -0,0 +1,150 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-tile,+amx-bf16,+amx-int8,+amx-transpose | FileCheck %s

define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 {
; CHECK-LABEL: test_amx:
; CHECK: # %bb.0:
; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0
; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2
; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0
; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2
; CHECK-NEXT: ttransposed %tmm3, %tmm1
; CHECK-NEXT: retq
call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride)
call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride)
call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride)
call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride)
call void @llvm.x86.ttransposed(i8 1, i8 3)
ret void
}

declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1)

define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
; CHECK-LABEL: test_amx3:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movw $8, %cx
; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4
; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4
; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4
; CHECK-NEXT: ttransposed %tmm4, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
%2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
%3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
%4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
%5 = extractvalue { x86_amx, x86_amx } %4, 0
%6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6)
ret void
}

define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
; CHECK-LABEL: test_amx_spill:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6
; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6
; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6
; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6
; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx)
; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx)
; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
%b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
%b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
%b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
%b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
%b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
%e11 = extractvalue { x86_amx, x86_amx } %b1, 0
%e12 = extractvalue { x86_amx, x86_amx } %b1, 1
%e21 = extractvalue { x86_amx, x86_amx } %b2, 0
%e22 = extractvalue { x86_amx, x86_amx } %b2, 1
%e31 = extractvalue { x86_amx, x86_amx } %b3, 0
%e32 = extractvalue { x86_amx, x86_amx } %b3, 1
%e41 = extractvalue { x86_amx, x86_amx } %b4, 0
%e42 = extractvalue { x86_amx, x86_amx } %b4, 1
%e51 = extractvalue { x86_amx, x86_amx } %b5, 0
%e52 = extractvalue { x86_amx, x86_amx } %b5, 1
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52)
ret void
}

declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64)
declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx)

attributes #0 = { nounwind }
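Editor's note: pulling the pieces together, the expected end-to-end use of the new intrinsics at the IR level is captured by the following minimal sketch, distilled from test_amx3 above (the 8 x 8 shapes and the names @sketch, %src, %dst are illustrative only):

define void @sketch(i8* %src, i8* %dst, i64 %stride) {
  ; load two consecutive tiles from %src as a pair
  %pair = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %src, i64 %stride)
  %lo = extractvalue { x86_amx, x86_amx } %pair, 0
  ; transpose one tile of the pair and store the result
  %t = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %lo)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %dst, i64 %stride, x86_amx %t)
  ret void
}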
@ -3,7 +3,7 @@
target triple = "x86_64-unknown-unknown"
declare void @bar1()
define preserve_allcc void @foo()#0 {
; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
call void @bar1()
call void @bar2()
ret void
@ -11,7 +11,7 @@ define preserve_allcc void @foo()#0 {
declare void @bar2()

define preserve_nonecc void @foo2()#0 {
; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
call void @bar1()
call void @bar2()
ret void
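The substantive change in this register-usage hunk is that the clobber list for foo2 (preserve_none) now also contains the tile-pair super-registers $tmm0_tmm1 through $tmm6_tmm7, which model the two-tile destinations of the new T2RPNTLVWZ* loads. A minimal source-level sketch of the same situation (my own illustration, not part of the patch; assumes a Clang recent enough to accept the preserve_none attribute):

/* Sketch: a preserve_none callee may clobber nearly every register, so
 * interprocedural register-usage analysis must now report the AMX
 * tile-pair super-registers as clobbered too. */
__attribute__((preserve_none)) void callee(void);

void caller(void) {
  callee(); /* all tile and tile-pair state must be assumed dead here */
}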
57 llvm/test/MC/X86/amx-transpose-att.s Normal file
@ -0,0 +1,57 @@
// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s

// CHECK: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10]
t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4

// CHECK: t2rpntlvwz0 291(%r8,%rax,4), %tmm2
// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
t2rpntlvwz0 291(%r8,%rax,4), %tmm2

// CHECK: t2rpntlvwz0 -32(,%rbp,2), %tmm2
// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
t2rpntlvwz0 -32(,%rbp,2), %tmm2

// CHECK: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4
// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10]
t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4

// CHECK: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2

// CHECK: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
t2rpntlvwz0t1 -32(,%rbp,2), %tmm2

// CHECK: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4
// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10]
t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4

// CHECK: t2rpntlvwz1 291(%r8,%rax,4), %tmm2
// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
t2rpntlvwz1 291(%r8,%rax,4), %tmm2

// CHECK: t2rpntlvwz1 -32(,%rbp,2), %tmm2
// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
t2rpntlvwz1 -32(,%rbp,2), %tmm2

// CHECK: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm2
// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0x94,0xf5,0x00,0x00,0x00,0x10]
t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm2

// CHECK: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2

// CHECK: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
t2rpntlvwz1t1 -32(,%rbp,2), %tmm2

// CHECK: ttransposed %tmm1, %tmm5
// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9]
ttransposed %tmm1, %tmm5

// CHECK: ttransposed %tmm2, %tmm3
// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda]
ttransposed %tmm2, %tmm3
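For reference, the mnemonics exercised above map one-to-one onto the raw Clang builtins added earlier in this patch. A rough usage sketch (mine, not from the commit; it assumes a tile configuration has already been loaded via __builtin_ia32_tile_loadconfig, and that the immediates are tile register numbers, with the pair loads targeting an even-numbered tile):

#include <stddef.h>

/* Assumed build flags: clang -mamx-tile -mamx-transpose -c sketch.c,
 * using the -mamx-transpose driver option added by this patch. */
void load_pair_then_transpose(const void *base, size_t stride) {
  /* Load a pair of tiles starting at tmm0 (i.e. tmm0:tmm1); the t1 forms
     differ only in a cache-locality hint per the linked ISA reference. */
  __builtin_ia32_t2rpntlvwz0(0, base, stride);
  /* Transpose tmm1 into tmm4; the first immediate is the destination. */
  __builtin_ia32_ttransposed(4, 1);
}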
58 llvm/test/MC/Disassembler/X86/amx-transpose-att.txt Normal file
@ -0,0 +1,58 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL

# ATT: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
# INTEL: t2rpntlvwz0 tmm4, [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10

# ATT: t2rpntlvwz0 291(%r8,%rax,4), %tmm2
# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00

# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2
# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32]
0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff

# ATT: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4
# INTEL: t2rpntlvwz0t1 tmm4, [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10

# ATT: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00

# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff

# ATT: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4
# INTEL: t2rpntlvwz1 tmm4, [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10

# ATT: t2rpntlvwz1 291(%r8,%rax,4), %tmm2
# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00

# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2
# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32]
0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff

# ATT: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm4
# INTEL: t2rpntlvwz1t1 tmm4, [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x79,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10

# ATT: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00

# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff

# ATT: ttransposed %tmm1, %tmm2
# INTEL: ttransposed tmm2, tmm1
0xc4,0xe2,0x7a,0x5f,0xd1

# ATT: ttransposed %tmm2, %tmm3
# INTEL: ttransposed tmm3, tmm2
0xc4,0xe2,0x7a,0x5f,0xda
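The byte sequences above can be sanity-checked by hand: every instruction here uses the three-byte VEX prefix with map 0F38, and both tile registers of ttransposed live in the ModRM byte. A small self-contained check (my own illustration; it hard-codes the standard VEX3 field layout) against the first ttransposed encoding:

#include <stdio.h>

int main(void) {
  /* 0xc4,0xe2,0x7a,0x5f,0xd1 disassembles to "ttransposed %tmm1, %tmm2" */
  const unsigned char enc[] = {0xc4, 0xe2, 0x7a, 0x5f, 0xd1};
  unsigned char vex3 = enc[2];  /* W | vvvv | L | pp */
  unsigned char modrm = enc[4]; /* mod | reg | rm */
  printf("pp=%u (2 selects the F3 prefix, i.e. VEX.128.F3.0F38)\n", vex3 & 3);
  printf("dest=tmm%u source=tmm%u\n", (modrm >> 3) & 7, modrm & 7);
  return 0; /* prints pp=2, dest=tmm2 source=tmm1, matching the test */
}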
57 llvm/test/MC/X86/amx-transpose-intel.s Normal file
@ -0,0 +1,57 @@
// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s

// CHECK: t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10]
t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456]

// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]

// CHECK: t2rpntlvwz0 tmm2, [2*rbp - 32]
// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
t2rpntlvwz0 tmm2, [2*rbp - 32]

// CHECK: t2rpntlvwz0t1 tmm6, [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10]
t2rpntlvwz0t1 tmm6, [rbp + 8*r14 + 268435456]

// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]

// CHECK: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
t2rpntlvwz0t1 tmm2, [2*rbp - 32]

// CHECK: t2rpntlvwz1 tmm0, [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0x84,0xf5,0x00,0x00,0x00,0x10]
t2rpntlvwz1 tmm0, [rbp + 8*r14 + 268435456]

// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]

// CHECK: t2rpntlvwz1 tmm2, [2*rbp - 32]
// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
t2rpntlvwz1 tmm2, [2*rbp - 32]

// CHECK: t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10]
t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456]

// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]

// CHECK: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
t2rpntlvwz1t1 tmm2, [2*rbp - 32]

// CHECK: ttransposed tmm5, tmm1
// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9]
ttransposed tmm5, tmm1

// CHECK: ttransposed tmm3, tmm2
// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda]
ttransposed tmm3, tmm2
@ -1113,7 +1113,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) {
// Create a stack location and ensure it's tracked.
SpillLoc SL = {getRegByName("RSP"), StackOffset::getFixed(-8)};
SpillLocationNo SpillNo = *MTracker->getOrTrackSpillLoc(SL);
ASSERT_EQ(MTracker->getNumLocs(), 11u); // Tracks all possible stack locs.
ASSERT_EQ(MTracker->getNumLocs(), 13u); // Tracks all possible stack locs.
// Locations are: RSP, stack slots from 2^3 bits wide up to 2^9 for zmm regs,
// then slots for sub_8bit_hi and sub_16bit_hi ({8, 8} and {16, 16}).
// Finally, one for spilt fp80 registers.
@ -1135,7 +1135,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) {
// There are other locations, for things like xmm0, which we're going to
// ignore here.

auto [MInLocs, MOutLocs] = allocValueTables(4, 11);
auto [MInLocs, MOutLocs] = allocValueTables(4, 13);

// Transfer function: start with nothing.
SmallVector<MLocTransferMap, 1> TransferFunc;
@ -1170,7 +1170,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) {
// function.
TransferFunc[1].insert({ALStackLoc, ALDefInBlk1});
TransferFunc[1].insert({HAXStackLoc, HAXDefInBlk1});
initValueArray(MInLocs, 4, 11);
initValueArray(MInLocs, 4, 13);
placeMLocPHIs(*MF, AllBlocks, MInLocs, TransferFunc);
EXPECT_EQ(MInLocs[3][ALStackLoc.asU64()], ALPHI);
EXPECT_EQ(MInLocs[3][AXStackLoc.asU64()], AXPHI);
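The 11 -> 13 bump in these assertions follows from the comments above: the tracker allocates one stack location per distinct spill size, and this patch introduces new sizes once tile pairs exist (my reading of the test; the commit does not spell this out). A quick consistency check of the pre-patch count:

#include <assert.h>

int main(void) {
  int rsp = 1;        /* the $rsp location itself */
  int pow2_slots = 7; /* 2^3 .. 2^9 bit-wide slots, per the comment above */
  int hi_subregs = 2; /* sub_8bit_hi and sub_16bit_hi */
  int fp80 = 1;       /* spilt fp80 registers */
  assert(rsp + pow2_slots + hi_subregs + fp80 == 11); /* pre-patch total */
  /* Post-patch the test expects 13, i.e. two additional tracked sizes. */
  return 0;
}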
@ -1162,6 +1162,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
TYPE("vz512mem", TYPE_MVSIBZ)
TYPE("BNDR", TYPE_BNDR)
TYPE("TILE", TYPE_TMM)
TYPE("TILEPair", TYPE_TMM_PAIR)
errs() << "Unhandled type string " << s << "\n";
llvm_unreachable("Unhandled type string");
}
@ -1243,6 +1244,7 @@ RecognizableInstr::rmRegisterEncodingFromString(const std::string &s,
ENCODING("VK64", ENCODING_RM)
ENCODING("BNDR", ENCODING_RM)
ENCODING("TILE", ENCODING_RM)
ENCODING("TILEPair", ENCODING_RM)
errs() << "Unhandled R/M register encoding " << s << "\n";
llvm_unreachable("Unhandled R/M register encoding");
}
@ -1292,6 +1294,7 @@ RecognizableInstr::roRegisterEncodingFromString(const std::string &s,
ENCODING("VK64WM", ENCODING_REG)
ENCODING("BNDR", ENCODING_REG)
ENCODING("TILE", ENCODING_REG)
ENCODING("TILEPair", ENCODING_REG)
errs() << "Unhandled reg/opcode register encoding " << s << "\n";
llvm_unreachable("Unhandled reg/opcode register encoding");
}
@ -1322,6 +1325,7 @@ RecognizableInstr::vvvvRegisterEncodingFromString(const std::string &s,
ENCODING("VK32", ENCODING_VVVV)
ENCODING("VK64", ENCODING_VVVV)
ENCODING("TILE", ENCODING_VVVV)
ENCODING("TILEPair", ENCODING_VVVV)
errs() << "Unhandled VEX.vvvv register encoding " << s << "\n";
llvm_unreachable("Unhandled VEX.vvvv register encoding");
}
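These table-generator hunks are mechanical: RecognizableInstr maps register-class names from the .td files onto operand-type and encoding enums, so supporting the new TILEPair class needs exactly one entry per lookup table. A stripped-down C sketch of the same string-to-enum dispatch pattern (illustrative only; names are simplified from the real C++):

#include <stdio.h>
#include <string.h>

typedef enum { TYPE_TMM, TYPE_TMM_PAIR, TYPE_UNKNOWN } OperandType;

static OperandType typeFromString(const char *s) {
  if (strcmp(s, "TILE") == 0)
    return TYPE_TMM;
  if (strcmp(s, "TILEPair") == 0) /* the entry this patch adds */
    return TYPE_TMM_PAIR;
  return TYPE_UNKNOWN;
}

int main(void) {
  printf("%d\n", (int)typeFromString("TILEPair")); /* prints 1 */
  return 0;
}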