Revert "[SeparateConstOffsetFromGEP] Decompose constant xor operand if possible" (#179339)

A miscompile was found (see #175724), and it's complicated to fix. We're
going to revert for now, and look at reimplementing a fixed version
later.
This commit is contained in:
Eli Friedman 2026-02-03 03:33:14 -08:00 committed by GitHub
parent 139e2fb602
commit a2c7c6032f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 6 additions and 519 deletions

View File

@ -295,10 +295,6 @@ private:
bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
bool NonNegative);
/// Analyze XOR instruction to extract disjoint constant bits that behave
/// like addition operations for improved address mode folding.
APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
/// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
@ -601,9 +597,6 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
// Trace into subexpressions for more hoisting opportunities.
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
// Handle XOR with disjoint bits that can be treated as addition.
else if (BO->getOpcode() == Instruction::Xor)
ConstantOffset = extractDisjointBitsFromXor(BO);
} else if (isa<TruncInst>(V)) {
ConstantOffset =
find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@ -723,20 +716,11 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
Value *NextInChain = removeConstOffset(ChainIndex - 1);
Value *TheOther = BO->getOperand(1 - OpNo);
// If NextInChain is 0 and not the LHS of a sub, we can simplify the
// sub-expression to be just TheOther.
if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
if (CI->isZero()) {
// Custom XOR handling for disjoint bits - preserves original XOR
// with non-disjoint constant bits.
// TODO: The design should be updated to support partial constant
// extraction.
if (BO->getOpcode() == Instruction::Xor)
return BO;
// If NextInChain is 0 and not the LHS of a sub, we can simplify the
// sub-expression to be just TheOther.
if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
return TheOther;
}
if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
return TheOther;
}
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
@ -767,67 +751,6 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
return NewBO;
}
/// Split the constant operand of an XOR into a part that can be hoisted as a
/// constant offset and a part that has to stay inside the XOR.
///
/// A constant bit that lines up with a bit known to be zero in the other
/// operand can never collide with a set bit, so XOR-ing it in produces the
/// same value as adding it. Those "disjoint" bits are returned as the
/// extractable offset; the remaining constant bits stay in the XOR:
///
///   Base ^ Const  ==>  (Base ^ (Const & ~KnownZero(Base)))
///                        + (Const & KnownZero(Base))
///
/// Worked example, where only the lowest bit of %ptr is known to be zero:
///   input:    xor %ptr, 3              ; 3 = 0b11
///   disjoint: 0b11 & 0b01 = 0b01
///   output:   (xor %ptr, 2) + 1        ; the +1 can fold into the address
///
/// Side effect: when the returned offset is non-zero, the reduced XOR constant
/// (the non-disjoint bits) is appended to UserChain so that it can replace the
/// original constant when the expression is rebuilt.
///
/// NOTE(review): only the XOR itself is inspected here; no sign-/zero-extension
/// state is passed in. A miscompile has been reported against this extraction
/// (see #175724) -- confirm the preconditions before relying on it.
///
/// \param XorInst The XOR binary operator to analyze.
/// \return The disjoint constant bits, or zero when the XOR's second operand
///         is not a ConstantInt or no constant bit is disjoint from the base.
APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
    BinaryOperator *XorInst) {
  assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
         "Expected XOR instruction");

  Type *const XorTy = XorInst->getType();

  // Only the shape `xor Base, ConstantInt` is handled.
  Value *Base = nullptr;
  ConstantInt *Mask = nullptr;
  if (!match(XorInst, m_Xor(m_Value(Base), m_ConstantInt(Mask))))
    return APInt::getZero(XorTy->getScalarSizeInBits());

  // Bits of the base operand that are proven to be zero.
  const SimplifyQuery Query(DL);
  const APInt KnownZero = computeKnownBits(Base, Query).Zero;
  const APInt &MaskBits = Mask->getValue();

  // Constant bits that only ever meet a zero bit in the base behave like an
  // addition. If there are none, this is already the zero result.
  APInt Hoistable = MaskBits & KnownZero;

  // FIXME: Enhance XOR constant extraction to handle nested binary operations.
  // Only the immediate XOR constant is examined today; cases such as
  //   xor (add %base, C1), C2 -> add %base, (C1 ^ disjoint_bits(C2))
  // could be processed recursively, but that needs careful analysis to keep
  // the semantics intact, particularly around sign extension and overflow.

  // Record the constant that remains in the XOR (every constant bit that is
  // not known-zero in the base) so the rebuilt XOR uses it in place of the
  // original constant.
  if (!Hoistable.isZero())
    UserChain.push_back(ConstantInt::get(XorTy, MaskBits & ~KnownZero));

  return Hoistable;
}
/// A helper function to check if reassociating through an entry in the user
/// chain would invalidate the GEP's nuw flag.
static bool allowsPreservingNUW(const User *U) {

View File

@ -1,435 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; Test that an xor with a constant operand is decomposed into a GEP offset.
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
; RUN: -S < %s | FileCheck %s
; Test that the GVN pass eliminates the redundant xor instructions produced by the decomposition.
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \
; RUN: -S < %s | FileCheck --check-prefix=GVN %s
; Check that disjoint constants are properly extracted and folded into GEP
; addressing modes, and that GVN eliminates the redundant computations.
define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test1(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384
; CHECK-NEXT: [[TMP11:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576
; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test1(
; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
%2 = select i1 %0, i32 0, i32 288
%3 = xor i32 %2, 32
%4 = xor i32 %2, 4128
%5 = xor i32 %2, 8224
%6 = xor i32 %2, 12320
%7 = getelementptr half, ptr addrspace(3) %1, i32 %3
%8 = getelementptr half, ptr addrspace(3) %1, i32 %4
%9 = getelementptr half, ptr addrspace(3) %1, i32 %5
%10 = getelementptr half, ptr addrspace(3) %1, i32 %6
%11 = load <8 x half>, ptr addrspace(3) %7, align 16
%12 = load <8 x half>, ptr addrspace(3) %8, align 16
%13 = load <8 x half>, ptr addrspace(3) %9, align 16
%14 = load <8 x half>, ptr addrspace(3) %10, align 16
%15 = fadd <8 x half> %11, %12
%16 = fadd <8 x half> %13, %14
%17 = fadd <8 x half> %15, %16
store <8 x half> %17, ptr addrspace(3) %1, align 16
ret void
}
; Check that disjoint constants are properly extracted and folded into GEP
; addressing modes, and that GVN eliminates the redundant computations.
define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test2(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 24576
; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 16384
; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16
; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16
; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test2(
; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
%2 = select i1 %0, i32 0, i32 288
%3 = xor i32 %2, 12320
%4 = xor i32 %2, 8224
%5 = xor i32 %2, 4128
%6 = xor i32 %2, 32
%7 = getelementptr half, ptr addrspace(3) %1, i32 %3
%8 = getelementptr half, ptr addrspace(3) %1, i32 %4
%9 = getelementptr half, ptr addrspace(3) %1, i32 %5
%10 = getelementptr half, ptr addrspace(3) %1, i32 %6
%11 = load <8 x half>, ptr addrspace(3) %7, align 16
%12 = load <8 x half>, ptr addrspace(3) %8, align 16
%13 = load <8 x half>, ptr addrspace(3) %9, align 16
%14 = load <8 x half>, ptr addrspace(3) %10, align 16
%15 = fadd <8 x half> %11, %12
%16 = fadd <8 x half> %13, %14
%17 = fadd <8 x half> %15, %16
store <8 x half> %17, ptr addrspace(3) %1, align 16
ret void
}
; Verify that xor instructions with different non-disjoint constants are optimized
define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test3(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192
; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; CHECK-NEXT: [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]]
; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test3(
; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
; GVN-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
%2 = select i1 %0, i32 0, i32 288
%3 = xor i32 %2, 32
%4 = xor i32 %2, 2336
%5 = xor i32 %2, 4128
%6 = getelementptr half, ptr addrspace(3) %1, i32 %3
%7 = getelementptr half, ptr addrspace(3) %1, i32 %4
%8 = getelementptr half, ptr addrspace(3) %1, i32 %5
%9 = load <8 x half>, ptr addrspace(3) %6, align 16
%10 = load <8 x half>, ptr addrspace(3) %7, align 16
%11 = load <8 x half>, ptr addrspace(3) %8, align 16
%12 = fadd <8 x half> %9, %10
%13 = fadd <8 x half> %11, %12
store <8 x half> %13, ptr addrspace(3) %1, align 16
ret void
}
; Verify that no optimization occurs when disjoint constants are absent
define amdgpu_kernel void @test4(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test4(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test4(
; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
%2 = select i1 %0, i32 0, i32 288
%3 = xor i32 %2, 32
%4 = xor i32 %2, 288
%5 = getelementptr half, ptr addrspace(3) %1, i32 %3
%6 = getelementptr half, ptr addrspace(3) %1, i32 %4
%7 = load <8 x half>, ptr addrspace(3) %5, align 16
%8 = load <8 x half>, ptr addrspace(3) %6, align 16
%9 = fadd <8 x half> %7, %8
store <8 x half> %9, ptr addrspace(3) %1, align 16
ret void
}
; Verify that XOR-BinOp-GEP usage chains are properly optimized
define amdgpu_kernel void @test5(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test5(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 256
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test5(
; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; GVN-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], 256
; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
%2 = select i1 %0, i32 0, i32 288
%3 = xor i32 %2, 32
%4 = xor i32 %2, 4128
%5 = add i32 %4, 256
%6 = getelementptr half, ptr addrspace(3) %1, i32 %3
%7 = getelementptr half, ptr addrspace(3) %1, i32 %5
%8 = load <8 x half>, ptr addrspace(3) %6, align 16
%9 = load <8 x half>, ptr addrspace(3) %7, align 16
%10 = fadd <8 x half> %8, %9
store <8 x half> %10, ptr addrspace(3) %1, align 16
ret void
}
; Verify that BinOp-XOR-GEP usage chains are properly optimized.
; In the test below, make sure we stop processing the chain at the xor
; and do not fold the constant from the add instruction into the GEP.
; The constant from the add could be folded as well; future work will
; cover these cases.
define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test6(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test6(
; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; GVN-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
; GVN-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
; GVN-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
%2 = select i1 %0, i32 0, i32 288
%3 = xor i32 %2, 32
%4 = add i32 %2, 256
%5 = xor i32 %4, 4128
%6 = getelementptr half, ptr addrspace(3) %1, i32 %3
%7 = getelementptr half, ptr addrspace(3) %1, i32 %5
%8 = load <8 x half>, ptr addrspace(3) %6, align 16
%9 = load <8 x half>, ptr addrspace(3) %7, align 16
%10 = fadd <8 x half> %8, %9
store <8 x half> %10, ptr addrspace(3) %1, align 16
ret void
}
; Verify that BinOp-XOR-GEP usage chains with a non-disjoint xor work as
; intended.
define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test6a(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP4]], 288
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
; CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test6a(
; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP4]], 288
; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
%2 = select i1 %0, i32 0, i32 288
%3 = xor i32 %2, 32
%4 = add i32 %2, 256
%5 = xor i32 %4, 288
%6 = getelementptr half, ptr addrspace(3) %1, i32 %3
%7 = getelementptr half, ptr addrspace(3) %1, i32 %5
%8 = load <8 x half>, ptr addrspace(3) %6, align 16
%9 = load <8 x half>, ptr addrspace(3) %7, align 16
%10 = fadd <8 x half> %8, %9
store <8 x half> %10, ptr addrspace(3) %1, align 16
ret void
}
; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are
; not extracted
define amdgpu_kernel void @test7(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test7(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test7(
; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
%2 = select i1 %0, i32 0, i32 288
%3 = xor i32 %2, 32
%4 = xor i32 %2, 32800
%5 = getelementptr half, ptr addrspace(3) %1, i32 %3
%6 = getelementptr half, ptr addrspace(3) %1, i32 %4
%7 = load <8 x half>, ptr addrspace(3) %5, align 16
%8 = load <8 x half>, ptr addrspace(3) %6, align 16
%9 = fadd <8 x half> %7, %8
store <8 x half> %9, ptr addrspace(3) %1, align 16
ret void
}

View File

@ -32,9 +32,8 @@ define ptr @test_overflow(ptr %p, i32 %a) {
define ptr @test_xor_overflow(ptr %p, i32 range(i32 0, -2147483648) %a) {
; CHECK-LABEL: define ptr @test_xor_overflow(
; CHECK-SAME: ptr [[P:%.*]], i32 range(i32 0, -2147483648) [[A:%.*]]) {
; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[A]], 2147483647
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[XOR1]], 2
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P]], i32 [[TMP1]]
; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], -1
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 [[XOR]]
; CHECK-NEXT: ret ptr [[UGLYGEP]]
;
%xor = xor i32 %a, -1