Revert "[WebAssembly] Combine i128 to v16i8 for setcc & expand memcmp for 16 byte loads with simd128" (#153360)

Reverts llvm/llvm-project#149461

The first test w/ memcmp in `test/neon/test_neon_wasm_simd.cpp` in the
Emscripten test suite has failed. This PR applies a revert so I can take
a closer look at it

Test case link:
https://github.com/emscripten-core/emscripten/blob/main/test/neon/test_neon_wasm_simd.cpp

Compile option: `em++ test_neon_wasm_simd.cpp -O2 -mfpu=neon -msimd128
-o something.js`

Original comment report:
https://github.com/llvm/llvm-project/pull/149461#issuecomment-3181652746
This commit is contained in:
Jasmine Tang 2025-08-13 00:41:44 -07:00 committed by GitHub
parent 48bfaa4c06
commit d32793ca6e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 18 additions and 150 deletions

View File

@ -3383,55 +3383,8 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) {
return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0));
}
/// Try to convert a i128 comparison to a v16i8 comparison before type
/// legalization splits it up into chunks
static SDValue
combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const WebAssemblySubtarget *Subtarget) {
SDLoc DL(N);
SDValue X = N->getOperand(0);
SDValue Y = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT OpVT = X.getValueType();
SelectionDAG &DAG = DCI.DAG;
if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat))
return SDValue();
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
// We're looking for an oversized integer equality comparison with SIMD
if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 ||
!Subtarget->hasSIMD128() || !isIntEqualitySetCC(CC))
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
auto IsVectorBitCastCheap = [](SDValue X) {
X = peekThroughBitcasts(X);
return isa<ConstantSDNode>(X) || X.getOpcode() == ISD::LOAD;
};
if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
return SDValue();
SDValue VecX = DAG.getBitcast(MVT::v16i8, X);
SDValue VecY = DAG.getBitcast(MVT::v16i8, Y);
SDValue Cmp = DAG.getSetCC(DL, MVT::v16i8, VecX, VecY, CC);
SDValue Intr =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
{DAG.getConstant(CC == ISD::SETEQ ? Intrinsic::wasm_alltrue
: Intrinsic::wasm_anytrue,
DL, MVT::i32),
Cmp});
return DAG.getSetCC(DL, VT, Intr, DAG.getConstant(0, DL, MVT::i32), CC);
}
static SDValue performSETCCCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const WebAssemblySubtarget *Subtarget) {
TargetLowering::DAGCombinerInfo &DCI) {
if (!DCI.isBeforeLegalize())
return SDValue();
@ -3439,9 +3392,6 @@ static SDValue performSETCCCombine(SDNode *N,
if (!VT.isScalarInteger())
return SDValue();
if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget))
return V;
SDValue LHS = N->getOperand(0);
if (LHS->getOpcode() != ISD::BITCAST)
return SDValue();
@ -3621,7 +3571,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BITCAST:
return performBitcastCombine(N, DCI);
case ISD::SETCC:
return performSETCCCombine(N, DCI, Subtarget);
return performSETCCCombine(N, DCI);
case ISD::VECTOR_SHUFFLE:
return performVECTOR_SHUFFLECombine(N, DCI);
case ISD::SIGN_EXTEND:

View File

@ -147,8 +147,7 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
Options.AllowOverlappingLoads = true;
if (ST->hasSIMD128())
Options.LoadSizes.push_back(16);
// TODO: Teach WebAssembly backend about load v128.
Options.LoadSizes.append({8, 4, 2, 1});
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
target triple = "wasm32-unknown-unknown"
@ -127,16 +127,24 @@ define i1 @memcmp_expand_8(ptr %a, ptr %b) {
ret i1 %res
}
; TODO: Should be using a single load i64x2 or equivalent in bitsizes
define i1 @memcmp_expand_16(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_16:
; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
; CHECK-NEXT: i32.eqz $push4=, $pop3
; CHECK-NEXT: return $pop4
; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0
; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0
; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6
; CHECK-NEXT: i32.const $push0=, 8
; CHECK-NEXT: i32.add $push3=, $0, $pop0
; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0
; CHECK-NEXT: i32.const $push11=, 8
; CHECK-NEXT: i32.add $push1=, $1, $pop11
; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2
; CHECK-NEXT: i64.or $push9=, $pop8, $pop5
; CHECK-NEXT: i64.eqz $push10=, $pop9
; CHECK-NEXT: return $pop10
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
%res = icmp eq i32 %cmp_16, 0
ret i1 %res

View File

@ -1,89 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
target triple = "wasm32-unknown-unknown"
declare i32 @memcmp(ptr, ptr, i32)
define i1 @setcc_load(ptr %a, ptr %b) {
; CHECK-LABEL: setcc_load:
; CHECK: .functype setcc_load (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
; CHECK-NEXT: i32.eqz $push4=, $pop3
; CHECK-NEXT: return $pop4
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
%res = icmp eq i32 %cmp_16, 0
ret i1 %res
}
; INFO: Negative test: noimplicitfloat disables simd
define i1 @setcc_load_should_not_vectorize(ptr %a, ptr %b) noimplicitfloat {
; CHECK-LABEL: setcc_load_should_not_vectorize:
; CHECK: .functype setcc_load_should_not_vectorize (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i64.load $push4=, 0($0):p2align=0
; CHECK-NEXT: i64.load $push3=, 0($1):p2align=0
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop3
; CHECK-NEXT: i64.load $push1=, 8($0):p2align=0
; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
; CHECK-NEXT: i64.xor $push2=, $pop1, $pop0
; CHECK-NEXT: i64.or $push6=, $pop5, $pop2
; CHECK-NEXT: i64.eqz $push7=, $pop6
; CHECK-NEXT: return $pop7
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
%res = icmp eq i32 %cmp_16, 0
ret i1 %res
}
define i1 @setcc_eq_const_i128(ptr %ptr) {
; CHECK-LABEL: setcc_eq_const_i128:
; CHECK: .functype setcc_eq_const_i128 (i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.load $push0=, 0($0)
; CHECK-NEXT: v128.const $push1=, 6, 0
; CHECK-NEXT: i8x16.eq $push2=, $pop0, $pop1
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
; CHECK-NEXT: i32.eqz $push4=, $pop3
; CHECK-NEXT: return $pop4
%l = load i128, ptr %ptr
%res = icmp eq i128 %l, 6
ret i1 %res
}
define i1 @setcc_ne_const_i128(ptr %ptr) {
; CHECK-LABEL: setcc_ne_const_i128:
; CHECK: .functype setcc_ne_const_i128 (i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.load $push0=, 0($0)
; CHECK-NEXT: v128.const $push1=, 16, 0
; CHECK-NEXT: i8x16.ne $push2=, $pop0, $pop1
; CHECK-NEXT: v128.any_true $push3=, $pop2
; CHECK-NEXT: return $pop3
%l = load i128, ptr %ptr
%res = icmp ne i128 %l, 16
ret i1 %res
}
; INFO: Negative test: only eq and ne works
define i1 @setcc_slt_const_i128(ptr %ptr) {
; CHECK-LABEL: setcc_slt_const_i128:
; CHECK: .functype setcc_slt_const_i128 (i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i64.load $push2=, 0($0)
; CHECK-NEXT: i64.const $push3=, 25
; CHECK-NEXT: i64.lt_u $push4=, $pop2, $pop3
; CHECK-NEXT: i64.load $push8=, 8($0)
; CHECK-NEXT: local.tee $push7=, $1=, $pop8
; CHECK-NEXT: i64.const $push0=, 0
; CHECK-NEXT: i64.lt_s $push1=, $pop7, $pop0
; CHECK-NEXT: i64.eqz $push5=, $1
; CHECK-NEXT: i32.select $push6=, $pop4, $pop1, $pop5
; CHECK-NEXT: return $pop6
%l = load i128, ptr %ptr
%res = icmp slt i128 %l, 25
ret i1 %res
}