Björn Pettersson 5e5e300d07
[SelectionDAG] Fix bug related to demanded bits/elts for BITCAST (#145902)
When we have a BITCAST and the source type is a vector with smaller
elements compared to the destination type, then we need to demand all
the source elements that make up the demanded elts for the result when
doing recursive calls to SimplifyDemandedBits,
SimplifyDemandedVectorElts and SimplifyMultipleUseDemandedBits. The problem
is that those simplifications are allowed to turn non-demanded elements
of a vector into POISON, so unless we demand all source elements that
make up the result there is a risk that the result would be more
poisonous (even for demanded elts) after the simplification.

The patch fixes some bugs in SimplifyMultipleUseDemandedBits and
SimplifyDemandedBits for situations when we did not consider the problem
described above. Now we make sure that we also demand vector elements
that "must not be turned into poison" even if those elements correspond
to bits that do not need to be defined according to the DemandedBits
mask.

Fixes #138513
2026-02-23 14:38:07 +01:00

358 lines
13 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512,AVX512-V4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512-VBMI
; Test: compare two <4 x i32> vectors for equality, bitcast the <4 x i1>
; result to an i4 mask and reverse the bit order of that mask with
; llvm.bitreverse.i4.
; In the expectations below, the SSE2 run extracts the mask with movmskps and
; rearranges its four bits with scalar shift/and/or instructions, while the
; other runs reverse the vector lanes with a [3,2,1,0] shuffle (on the compare
; result, or on both compare inputs for the AVX512 runs) so that the extracted
; mask is already in reversed order.
define i4 @reverse_cmp_v4i1(<4 x i32> %a0, <4 x i32> %a1) {
; SSE2-LABEL: reverse_cmp_v4i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: leal (%rax,%rax), %ecx
; SSE2-NEXT: andb $4, %cl
; SSE2-NEXT: leal (,%rax,8), %edx
; SSE2-NEXT: andb $8, %dl
; SSE2-NEXT: orb %cl, %dl
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrb %cl
; SSE2-NEXT: andb $2, %cl
; SSE2-NEXT: orb %dl, %cl
; SSE2-NEXT: shrb $3, %al
; SSE2-NEXT: orb %cl, %al
; SSE2-NEXT: # kill: def $al killed $al killed $rax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v4i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqd %xmm1, %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE42-NEXT: movmskps %xmm0, %eax
; SSE42-NEXT: # kill: def $al killed $al killed $eax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX2-NEXT: vmovmskps %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: reverse_cmp_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
%cmp = icmp eq <4 x i32> %a0, %a1
%mask = bitcast <4 x i1> %cmp to i4
%rev = tail call i4 @llvm.bitreverse.i4(i4 %mask)
ret i4 %rev
}
; Bit-reverse intrinsic called by reverse_cmp_v4i1.
declare i4 @llvm.bitreverse.i4(i4)
; Test: compare two <8 x i16> vectors for equality, bitcast the <8 x i1>
; result to an i8 mask and reverse the bit order of that mask with
; llvm.bitreverse.i8.
; In the expectations below, the SSE2 run packs the compare result to bytes,
; extracts the mask with pmovmskb and reverses it in a scalar register
; (rotate by 4, then shift/and/or steps using the masks 51 and 85). The other
; runs reverse the element order in a vector register (pshufb, or vpermd on
; the mask expanded by vpmovm2d) before extracting the mask.
define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) {
; SSE2-LABEL: reverse_cmp_v8i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: rolb $4, %al
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andb $51, %cl
; SSE2-NEXT: shlb $2, %cl
; SSE2-NEXT: shrb $2, %al
; SSE2-NEXT: andb $51, %al
; SSE2-NEXT: orb %cl, %al
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andb $85, %cl
; SSE2-NEXT: addb %cl, %cl
; SSE2-NEXT: shrb %al
; SSE2-NEXT: andb $85, %al
; SSE2-NEXT: orb %cl, %al
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v8i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqw %xmm1, %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: # kill: def $al killed $al killed $eax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v8i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: reverse_cmp_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmovmskps %ymm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cmp = icmp eq <8 x i16> %a0, %a1
%mask = bitcast <8 x i1> %cmp to i8
%rev = tail call i8 @llvm.bitreverse.i8(i8 %mask)
ret i8 %rev
}
; Bit-reverse intrinsic called by reverse_cmp_v8i1.
declare i8 @llvm.bitreverse.i8(i8)
; Test: compare two <16 x i8> vectors for equality, bitcast the <16 x i1>
; result to an i16 mask and reverse the bit order of that mask with
; llvm.bitreverse.i16.
; In the expectations below, the SSE2 run extracts the mask with pmovmskb and
; reverses it in a scalar register (rolw by 8, then shift/and/or steps with
; the masks 0xF0F, 0x3333 and 0x5555). The other runs reverse the element
; order in a vector register (pshufb, or vpermw on the mask expanded by
; vpmovm2w) before converting back to a scalar mask.
define i16 @reverse_cmp_v16i1(<16 x i8> %a0, <16 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v16i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: rolw $8, %ax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $3855, %ecx # imm = 0xF0F
; SSE2-NEXT: shll $4, %ecx
; SSE2-NEXT: shrl $4, %eax
; SSE2-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $13107, %ecx # imm = 0x3333
; SSE2-NEXT: shrl $2, %eax
; SSE2-NEXT: andl $13107, %eax # imm = 0x3333
; SSE2-NEXT: leal (%rax,%rcx,4), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $21845, %ecx # imm = 0x5555
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: andl $21845, %eax # imm = 0x5555
; SSE2-NEXT: leal (%rax,%rcx,2), %eax
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v16i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqb %xmm1, %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: # kill: def $ax killed $ax killed $eax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v16i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: reverse_cmp_v16i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovw2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cmp = icmp eq <16 x i8> %a0, %a1
%mask = bitcast <16 x i1> %cmp to i16
%rev = tail call i16 @llvm.bitreverse.i16(i16 %mask)
ret i16 %rev
}
; Bit-reverse intrinsic called by reverse_cmp_v16i1.
declare i16 @llvm.bitreverse.i16(i16)
; Test: compare two <32 x i8> vectors for equality, bitcast the <32 x i1>
; result to an i32 mask and reverse the bit order of that mask with
; llvm.bitreverse.i32.
; In the expectations below, the SSE2 run merges two pmovmskb results into one
; 32-bit mask and reverses it in a scalar register (bswapl followed by
; shift/and/or steps). The SSE4.2 run byte-reverses each 16-byte half with
; pshufb and merges the two masks with the halves swapped. The AVX2 run and
; the plain x86-64-v4 run byte-reverse within each 128-bit lane (vpshufb) and
; then swap the lanes (vpermq), while the run with +avx512vbmi uses a single
; vpermb with a 31..0 index vector. The two AVX512 runs differ here, so they
; are checked under their own prefixes.
define i32 @reverse_cmp_v32i1(<32 x i8> %a0, <32 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v32i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: pcmpeqb %xmm3, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: bswapl %ecx
; SSE2-NEXT: movl %ecx, %eax
; SSE2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; SSE2-NEXT: shll $4, %eax
; SSE2-NEXT: shrl $4, %ecx
; SSE2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movl %ecx, %eax
; SSE2-NEXT: andl $858993459, %eax # imm = 0x33333333
; SSE2-NEXT: shrl $2, %ecx
; SSE2-NEXT: andl $858993459, %ecx # imm = 0x33333333
; SSE2-NEXT: leal (%rcx,%rax,4), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: andl $1431655765, %eax # imm = 0x55555555
; SSE2-NEXT: leal (%rax,%rcx,2), %eax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v32i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqb %xmm2, %xmm0
; SSE42-NEXT: pcmpeqb %xmm3, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT: pshufb %xmm2, %xmm1
; SSE42-NEXT: pmovmskb %xmm1, %ecx
; SSE42-NEXT: pshufb %xmm2, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: shll $16, %eax
; SSE42-NEXT: orl %ecx, %eax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v32i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-V4-LABEL: reverse_cmp_v32i1:
; AVX512-V4: # %bb.0:
; AVX512-V4-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
; AVX512-V4-NEXT: vpmovm2b %k0, %ymm0
; AVX512-V4-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
; AVX512-V4-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512-V4-NEXT: vpmovmskb %ymm0, %eax
; AVX512-V4-NEXT: vzeroupper
; AVX512-V4-NEXT: retq
;
; AVX512-VBMI-LABEL: reverse_cmp_v32i1:
; AVX512-VBMI: # %bb.0:
; AVX512-VBMI-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
; AVX512-VBMI-NEXT: vpmovm2b %k0, %ymm0
; AVX512-VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512-VBMI-NEXT: vpmovmskb %ymm0, %eax
; AVX512-VBMI-NEXT: vzeroupper
; AVX512-VBMI-NEXT: retq
%cmp = icmp eq <32 x i8> %a0, %a1
%mask = bitcast <32 x i1> %cmp to i32
%rev = tail call i32 @llvm.bitreverse.i32(i32 %mask)
ret i32 %rev
}
; Bit-reverse intrinsic called by reverse_cmp_v32i1.
declare i32 @llvm.bitreverse.i32(i32)
; Test: compare two <64 x i8> vectors for equality, bitcast the <64 x i1>
; result to an i64 mask and reverse the bit order of that mask with
; llvm.bitreverse.i64.
; In the expectations below, the SSE2 run merges four pmovmskb results into
; one 64-bit mask and reverses it in a scalar register (bswapq followed by
; shift/and/or steps with movabsq constants). The SSE4.2 and AVX2 runs reverse
; each vector piece with shuffles and merge the per-piece masks in swapped
; order. The plain x86-64-v4 run byte-reverses within each 128-bit lane
; (vpshufb) and then reverses the lane order (vshufi64x2), while the run with
; +avx512vbmi uses a single vpermb with a 63..0 index vector. The two AVX512
; runs differ here, so they are checked under their own prefixes.
define i64 @reverse_cmp_v64i1(<64 x i8> %a0, <64 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v64i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: pcmpeqb %xmm5, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pcmpeqb %xmm6, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
; SSE2-NEXT: pcmpeqb %xmm7, %xmm3
; SSE2-NEXT: pmovmskb %xmm3, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: shlq $32, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: bswapq %rdx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $4, %rax
; SSE2-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE2-NEXT: andq %rcx, %rax
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: shlq $4, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE2-NEXT: movq %rdx, %rcx
; SSE2-NEXT: andq %rax, %rcx
; SSE2-NEXT: shrq $2, %rdx
; SSE2-NEXT: andq %rax, %rdx
; SSE2-NEXT: leaq (%rdx,%rcx,4), %rax
; SSE2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE2-NEXT: movq %rax, %rdx
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: shrq %rax
; SSE2-NEXT: andq %rcx, %rax
; SSE2-NEXT: leaq (%rax,%rdx,2), %rax
; SSE2-NEXT: retq
;
; SSE42-LABEL: reverse_cmp_v64i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqb %xmm4, %xmm0
; SSE42-NEXT: pcmpeqb %xmm5, %xmm1
; SSE42-NEXT: pcmpeqb %xmm6, %xmm2
; SSE42-NEXT: pcmpeqb %xmm7, %xmm3
; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT: pshufb %xmm4, %xmm3
; SSE42-NEXT: pmovmskb %xmm3, %eax
; SSE42-NEXT: pshufb %xmm4, %xmm2
; SSE42-NEXT: pmovmskb %xmm2, %ecx
; SSE42-NEXT: shll $16, %ecx
; SSE42-NEXT: orl %eax, %ecx
; SSE42-NEXT: pshufb %xmm4, %xmm1
; SSE42-NEXT: pmovmskb %xmm1, %edx
; SSE42-NEXT: pshufb %xmm4, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: shll $16, %eax
; SSE42-NEXT: orl %edx, %eax
; SSE42-NEXT: shlq $32, %rax
; SSE42-NEXT: orq %rcx, %rax
; SSE42-NEXT: retq
;
; AVX2-LABEL: reverse_cmp_v64i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vpmovmskb %ymm1, %ecx
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-V4-LABEL: reverse_cmp_v64i1:
; AVX512-V4: # %bb.0:
; AVX512-V4-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512-V4-NEXT: vpmovm2b %k0, %zmm0
; AVX512-V4-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
; AVX512-V4-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
; AVX512-V4-NEXT: vpmovb2m %zmm0, %k0
; AVX512-V4-NEXT: kmovq %k0, %rax
; AVX512-V4-NEXT: vzeroupper
; AVX512-V4-NEXT: retq
;
; AVX512-VBMI-LABEL: reverse_cmp_v64i1:
; AVX512-VBMI: # %bb.0:
; AVX512-VBMI-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512-VBMI-NEXT: vpmovm2b %k0, %zmm0
; AVX512-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512-VBMI-NEXT: vpmovb2m %zmm0, %k0
; AVX512-VBMI-NEXT: kmovq %k0, %rax
; AVX512-VBMI-NEXT: vzeroupper
; AVX512-VBMI-NEXT: retq
%cmp = icmp eq <64 x i8> %a0, %a1
%mask = bitcast <64 x i1> %cmp to i64
%rev = tail call i64 @llvm.bitreverse.i64(i64 %mask)
ret i64 %rev
}
; Bit-reverse intrinsic called by reverse_cmp_v64i1.
declare i64 @llvm.bitreverse.i64(i64)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SSE: {{.*}}