We have some discission in D99152 and llvm-dev and finially come up with a solution to add amx specific cast intrinsics. We've support the intrinsics in llvm IR. This patch is to replace bitcast with amx cast intrinsics in code emitting in FE. Differential Revision: https://reviews.llvm.org/D122567
105 lines
4.0 KiB
C
105 lines
4.0 KiB
C
// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f -target-feature +amx-int8 \
|
|
// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
|
|
|
|
#include <immintrin.h>
|
|
|
|
char buf[1024];
|
|
#define STRIDE 32
|
|
|
|
char buf2[1024];
|
|
|
|
// This is an example code and integration test.
|
|
void test_api(int cond, short row, short col) {
|
|
//CHECK-LABEL: @test_api
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tileloadd64.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tdpbssd.internal
|
|
//CHECK-DAG: call void @llvm.x86.tilestored64.internal
|
|
__tile1024i a = {row, 8};
|
|
__tile1024i b = {8, col};
|
|
__tile1024i c = {row, col};
|
|
|
|
if (cond) {
|
|
__tile_loadd(&a, buf, STRIDE);
|
|
__tile_loadd(&b, buf, STRIDE);
|
|
__tile_loadd(&c, buf, STRIDE);
|
|
} else {
|
|
__tile_loadd(&a, buf2, STRIDE);
|
|
__tile_loadd(&b, buf2, STRIDE);
|
|
__tile_loadd(&c, buf2, STRIDE);
|
|
}
|
|
__tile_dpbssd(&c, a, b);
|
|
__tile_stored(buf, STRIDE, c);
|
|
}
|
|
|
|
void test_tile_loadd(short row, short col) {
|
|
//CHECK-LABEL: @test_tile_loadd
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tileloadd64.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
__tile1024i a = {row, col};
|
|
__tile_loadd(&a, buf, STRIDE);
|
|
}
|
|
|
|
void test_tile_stream_loadd(short row, short col) {
|
|
//CHECK-LABEL: @test_tile_stream_loadd
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tileloaddt164.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
__tile1024i a = {row, col};
|
|
__tile_stream_loadd(&a, buf, STRIDE);
|
|
}
|
|
|
|
void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) {
|
|
//CHECK-LABEL: @test_tile_dpbssd
|
|
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tdpbssd.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
__tile_dpbssd(&c, a, b);
|
|
}
|
|
|
|
void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
|
|
//CHECK-LABEL: @test_tile_dpbsud
|
|
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tdpbsud.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
__tile_dpbsud(&c, a, b);
|
|
}
|
|
|
|
void test_tile_dpbusd(__tile1024i a, __tile1024i b, __tile1024i c) {
|
|
//CHECK-LABEL: @test_tile_dpbusd
|
|
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tdpbusd.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
__tile_dpbusd(&c, a, b);
|
|
}
|
|
|
|
void test_tile_dpbuud(__tile1024i a, __tile1024i b, __tile1024i c) {
|
|
//CHECK-LABEL: @test_tile_dpbuud
|
|
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tdpbuud.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
__tile_dpbuud(&c, a, b);
|
|
}
|
|
|
|
void test_tile_stored(__tile1024i c) {
|
|
//CHECK-LABEL: @test_tile_stored
|
|
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
|
|
//CHECK-DAG: call void @llvm.x86.tilestored64.internal
|
|
__tile_stored(buf, STRIDE, c);
|
|
}
|
|
|
|
void test_tile_zero(__tile1024i c) {
|
|
//CHECK-LABEL: @test_tile_zero
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tilezero.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
__tile_zero(&c);
|
|
}
|
|
|
|
void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
|
|
//CHECK-LABEL: @test_tile_dpbf16ps
|
|
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
|
|
//CHECK-DAG: call x86_amx @llvm.x86.tdpbf16ps.internal
|
|
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
|
|
__tile_dpbf16ps(&a, b, c);
|
|
}
|