Thurston Dang 2b75ff192d
[msan] Reland with even more improvement: Improve packed multiply-add instrumentation (#153353)
This reverts commit cf002847a464c004a57ca4777251b1aafc33d958, i.e., it
relands ba603b5e4d44f1a25207a2a00196471d2ba93424. That change was
reverted because it was subtly wrong: multiplying by an uninitialized
zero should not result in an initialized zero.

This reland fixes the issue by using instrumentation analogous to
visitAnd (bitwise AND of an initialized zero and an uninitialized value
results in an initialized value). Additionally, this reland expands a
test case, fixes the commit message, and optimizes the change to avoid
the need for horizontalReduce.

Before this change, the instrumentation had false positives: it did not
take into account that multiplying an initialized zero value by an
uninitialized value results in an initialized zero value. This change
fixes the issue during the multiplication step. The horizontal add step
is modeled using bitwise OR.
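
For illustration, a minimal sketch of the shadow computation for
llvm.x86.sse2.pmadd.wd (function and value names are illustrative, not
taken from the pass; the exact IR the pass emits is shown in the test
below):

define <4 x i32> @pmadd_wd_shadow_sketch(<8 x i16> %a, <8 x i16> %sa,
                                         <8 x i16> %b, <8 x i16> %sb) {
  %sa.nz = icmp ne <8 x i16> %sa, zeroinitializer   ; lane has uninitialized bits
  %sb.nz = icmp ne <8 x i16> %sb, zeroinitializer
  %a.nz  = icmp ne <8 x i16> %a,  zeroinitializer   ; lane value is nonzero
  %b.nz  = icmp ne <8 x i16> %b,  zeroinitializer
  ; visitAnd-style rule on the "nonzero" bits:
  ; poisoned product = (Sa & Sb) | (Va & Sb) | (Sa & Vb)
  %ss = and <8 x i1> %sa.nz, %sb.nz
  %vs = and <8 x i1> %a.nz,  %sb.nz
  %sv = and <8 x i1> %sa.nz, %b.nz
  %t  = or  <8 x i1> %ss, %vs
  %p  = or  <8 x i1> %t,  %sv
  ; widen each product's poison bit to a full i16 lane, then view pairs of
  ; i16 lanes as one i32 lane; the OR-modeled horizontal add falls out of
  ; the bitcast + "any bit set" test, so no explicit horizontalReduce is
  ; needed.
  %p16 = sext <8 x i1> %p to <8 x i16>
  %p32 = bitcast <8 x i16> %p16 to <4 x i32>
  %nz  = icmp ne <4 x i32> %p32, zeroinitializer
  %s   = sext <4 x i1> %nz to <4 x i32>
  ret <4 x i32> %s
}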
    
Future work can apply this improved handler to the equivalent AVX512
intrinsics (x86_avx512_pmaddw_d_512, x86_avx512_pmaddubs_w_512) and to
the AVX VNNI intrinsics.
2025-08-15 16:35:42 -07:00


; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s
; REQUIRES: x86-registered-target
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
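; pmadd.wd: each product lane's shadow is computed with the visitAnd-style rule
; (Sa & Sb) | (Va & Sb) | (Sa & Vb) on "lane is nonzero" bits, so a lane whose
; other operand is a provably initialized zero stays clean; pairs of i16 product
; lanes are then folded into each i32 output lane via bitcast + icmp ne.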
define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory {
; CHECK-LABEL: define <4 x i32> @Test_sse2_pmadd_wd(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[A]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i1> [[TMP2]], [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[TMP3]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i1> [[TMP6]], [[TMP13]]
; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[TMP11]] to <4 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <4 x i32> [[TMP16]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP17]] to <4 x i32>
; CHECK-NEXT: [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[C]]
;
entry:
%c = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b) nounwind
ret <4 x i32> %c
}
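; pmadd.ub.sw (MMX): same pattern as above, but the <1 x i64> operands and their
; shadows are first bitcast to <8 x i8>, and pairs of i8 product lanes are folded
; into each i16 output lane before the result is bitcast back to <1 x i64>.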
define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
; CHECK-LABEL: define <1 x i64> @Test_ssse3_pmadd_ub_sw(
; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <8 x i8> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i8> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP17]], [[TMP15]]
; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i1> [[TMP14]], [[TMP21]]
; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP16]], [[TMP11]]
; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP13]], [[TMP12]]
; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i1> [[TMP22]] to <8 x i8>
; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <4 x i16> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i16>
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i16> [[TMP23]] to i64
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
; CHECK-NEXT: store <1 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <1 x i64> [[C]]
;
entry:
%c = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind
ret <1 x i64> %c
}
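; psad.bw: the operand shadows are OR-ed, each 64-bit lane is collapsed to
; all-ones if any of its bits is uninitialized, and the result is shifted right
; by 48 so only the low 16 bits of each lane carry shadow (the sum of eight
; absolute byte differences fits in 16 bits).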
define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory {
; CHECK-LABEL: define <2 x i64> @Test_x86_sse2_psad_bw(
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i64> [[TMP6]], splat (i64 48)
; CHECK-NEXT: [[C:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> [[A]], <16 x i8> [[B]])
; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[C]]
;
%c = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a, <16 x i8> %b)
ret <2 x i64> %c
}
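; psad.bw (MMX): same as above, with the single 64-bit lane handled as a scalar i64.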
define <1 x i64> @Test_x86_mmx_psad_bw(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
; CHECK-LABEL: define <1 x i64> @Test_x86_mmx_psad_bw(
; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; CHECK-NEXT: [[TMP5:%.*]] = sext i1 [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 48
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[TMP6]] to <1 x i64>
; CHECK-NEXT: [[C:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
; CHECK-NEXT: store <1 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <1 x i64> [[C]]
;
entry:
%c = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind
ret <1 x i64> %c
}