Previously, we were selecting the wrong operand in cases where `Scalars` contained duplicate values. Stems from #135797.

Using: `opt -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v t.ll`

```
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64"

define void @foo(ptr noalias %A, ptr noalias %B) {
entry:
  %0 = load i32, ptr %B
  %add = add nsw i32 %0, 1
  store i32 %add, ptr %A
  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %B, i64 4
  %1 = load i32, ptr %arrayidx.1
  %add.1 = add nsw i32 %1, 1
  %arrayidx2.1 = getelementptr inbounds nuw i8, ptr %A, i64 4
  store i32 %add.1, ptr %arrayidx2.1
  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %B, i64 8
  %2 = load i32, ptr %arrayidx.2
  %add.2 = add nsw i32 %2, 1
  %arrayidx2.2 = getelementptr inbounds nuw i8, ptr %A, i64 8
  store i32 %add.2, ptr %arrayidx2.2
  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %B, i64 12
  %arrayidx2.3 = getelementptr inbounds nuw i8, ptr %A, i64 12
  store i32 %add, ptr %arrayidx2.3
  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %B, i64 16
  %4 = load i32, ptr %arrayidx.4
  %add.4 = add nsw i32 %4, 1
  %arrayidx2.4 = getelementptr inbounds nuw i8, ptr %A, i64 16
  store i32 %add.4, ptr %arrayidx2.4
  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %B, i64 20
  %5 = load i32, ptr %arrayidx.5
  %add.5 = add nsw i32 %5, 1
  %arrayidx2.5 = getelementptr inbounds nuw i8, ptr %A, i64 20
  store i32 %add.5, ptr %arrayidx2.5
  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %B, i64 24
  %6 = load i32, ptr %arrayidx.6
  %add.6 = add nsw i32 %6, 1
  %arrayidx2.6 = getelementptr inbounds nuw i8, ptr %A, i64 24
  store i32 %add.6, ptr %arrayidx2.6
  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %B, i64 28
  %7 = load i32, ptr %arrayidx.7
  %add.7 = add nsw i32 %7, 1
  %arrayidx2.7 = getelementptr inbounds nuw i8, ptr %A, i64 28
  store i32 %add.7, ptr %arrayidx2.7
  ret void
}
```

The following trace is produced; note that the wrong operand is used for `Idx > 2`.

Before:

```
GetScalarCost(), Idx=0
UniqueValues[Idx]: %add = add nsw i32 %0, 1
Op1: %0 = load i32, ptr %B, align 4
GetScalarCost(), Idx=1
UniqueValues[Idx]: %add.1 = add nsw i32 %1, 1
Op1: %1 = load i32, ptr %arrayidx.1, align 4
GetScalarCost(), Idx=2
UniqueValues[Idx]: %add.2 = add nsw i32 %2, 1
Op1: %2 = load i32, ptr %arrayidx.2, align 4
GetScalarCost(), Idx=3
UniqueValues[Idx]: %add.4 = add nsw i32 %3, 1
Op1: %0 = load i32, ptr %B, align 4
GetScalarCost(), Idx=4
UniqueValues[Idx]: %add.5 = add nsw i32 %4, 1
Op1: %3 = load i32, ptr %arrayidx.4, align 4
GetScalarCost(), Idx=5
UniqueValues[Idx]: %add.6 = add nsw i32 %5, 1
Op1: %4 = load i32, ptr %arrayidx.5, align 4
GetScalarCost(), Idx=6
UniqueValues[Idx]: %add.7 = add nsw i32 %6, 1
Op1: %5 = load i32, ptr %arrayidx.6, align 4
```

After:

```
GetScalarCost(), Idx=0
UniqueValues[Idx]: %add = add nsw i32 %0, 1
Op1: %0 = load i32, ptr %B, align 4
GetScalarCost(), Idx=1
UniqueValues[Idx]: %add.1 = add nsw i32 %1, 1
Op1: %1 = load i32, ptr %arrayidx.1, align 4
GetScalarCost(), Idx=2
UniqueValues[Idx]: %add.2 = add nsw i32 %2, 1
Op1: %2 = load i32, ptr %arrayidx.2, align 4
GetScalarCost(), Idx=3
UniqueValues[Idx]: %add.4 = add nsw i32 %3, 1
Op1: %3 = load i32, ptr %arrayidx.4, align 4
GetScalarCost(), Idx=4
UniqueValues[Idx]: %add.5 = add nsw i32 %4, 1
Op1: %4 = load i32, ptr %arrayidx.5, align 4
GetScalarCost(), Idx=5
UniqueValues[Idx]: %add.6 = add nsw i32 %5, 1
Op1: %5 = load i32, ptr %arrayidx.6, align 4
GetScalarCost(), Idx=6
UniqueValues[Idx]: %add.7 = add nsw i32 %6, 1
Op1: %6 = load i32, ptr %arrayidx.7, align 4
```
45 lines
1.7 KiB
LLVM
45 lines
1.7 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s
; The command above runs only the slp-vectorizer pass on this file and pipes the
; textual IR (-S) to FileCheck. It passes no -mtriple or -mattr option, so the
; target is the one named by the module-level triple declared below.
|
|
|
|
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
|
|
target triple = "aarch64-unknown-unknown"
|
|
|
|
; @foo issues four i32 stores to %A at byte offsets 0, 4, 8 and 12. The value
; %add is stored twice (offsets 0 and 8), so the stored scalars contain a
; duplicate. The change description accompanying this test says the SLP
; vectorizer used to select the wrong operand when the scalars contained
; duplicate values; this function presumably guards that case (confirm against
; the patch).
;
; Expected output, as pinned by the autogenerated assertions in the body:
;   - offsets 0 and 4 become a single <2 x i32> load / udiv / store;
;   - lane 0 of the vector udiv is extracted and stored to offset 8;
;   - the udiv by 2 that feeds offset 12 stays scalar.
; NOTE(review): in the assertions the capture [[A]] appears to bind to the %B
; argument (it is the pointer that is loaded from) and [[A1]] to %A; the names
; are autogenerated and do not match the argument names.
define void @foo(ptr noalias %A, ptr noalias %B) {
|
|
; CHECK-LABEL: @foo(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 12
|
|
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4
|
|
; CHECK-NEXT: [[ADD_3:%.*]] = udiv i32 [[TMP2]], 2
|
|
; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr i8, ptr [[A1:%.*]], i64 8
|
|
; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr i8, ptr [[A1]], i64 12
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 4
|
|
; CHECK-NEXT: [[TMP4:%.*]] = udiv <2 x i32> [[TMP1]], <i32 3, i32 8>
|
|
; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[A1]], align 4
|
|
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
|
|
; CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX2_2]], align 4
|
|
; CHECK-NEXT: store i32 [[ADD_3]], ptr [[ARRAYIDX2_3]], align 4
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
; Source addresses in %B at byte offsets 4 and 12 (offset 0 is %B itself;
; nothing is loaded from offset 8).
%arrayidx.1 = getelementptr i8, ptr %B, i64 4
|
|
%arrayidx.3 = getelementptr i8, ptr %B, i64 12
|
|
|
|
; Three scalar loads; note the unnamed values are numbered %0, %1, %3.
%0 = load i32, ptr %B
|
|
%1 = load i32, ptr %arrayidx.1
|
|
%3 = load i32, ptr %arrayidx.3
|
|
|
|
; Despite the %add* names these are unsigned divisions, each by a different
; constant divisor (3, 8, 2).
%add = udiv i32 %0, 3
|
|
%add.1 = udiv i32 %1, 8
|
|
%add.3 = udiv i32 %3, 2
|
|
|
|
; Destination addresses in %A at byte offsets 4, 8 and 12.
%arrayidx2.1 = getelementptr i8, ptr %A, i64 4
|
|
%arrayidx2.2 = getelementptr i8, ptr %A, i64 8
|
|
%arrayidx2.3 = getelementptr i8, ptr %A, i64 12
|
|
|
|
; Four consecutive i32 stores covering %A[0..15].
store i32 %add, ptr %A
|
|
store i32 %add.1, ptr %arrayidx2.1
|
|
; Second store of %add: this is the duplicated scalar in the store sequence.
store i32 %add, ptr %arrayidx2.2
|
|
store i32 %add.3, ptr %arrayidx2.3
|
|
ret void
|
|
}
|