This patch builds on 0d7286a652 by simplifying the code for detecting splat values and adding new tests demonstrating the lowering of splatted absolute-value shift amounts, which are common in code generated by Halide. The lowering is very bad right now, but subsequent patches will improve it considerably. The tests will be useful for evaluating the improvements in those patches.
Reviewed By: aheejin
Differential Revision: https://reviews.llvm.org/D83493
104 lines
4.4 KiB
LLVM
104 lines
4.4 KiB
LLVM
; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
|
|
|
|
; Test that SIMD shifts can be lowered correctly even with shift
|
|
; values that are more complex than plain splats.
|
|
|
|
; Module-level target settings for the test. The data layout string requests
; little-endian ("e") with 32-bit pointers ("p:32:32"), and the triple selects
; the 32-bit WebAssembly target.
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
|
|
target triple = "wasm32-unknown-unknown"
|
|
|
|
;; TODO: Optimize this further by scalarizing the add
|
|
|
|
; CHECK-LABEL: shl_add:
|
|
; CHECK-NEXT: .functype shl_add (v128, i32, i32) -> (v128)
|
|
; CHECK-NEXT: i8x16.splat $push1=, $1
|
|
; CHECK-NEXT: i8x16.splat $push0=, $2
|
|
; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push3=, $pop2, 0
|
|
; CHECK-NEXT: i8x16.shl $push4=, $0, $pop3
|
|
; CHECK-NEXT: return $pop4
|
|
; shl_add: shift every i8 lane of %v left by (splat(%a) + splat(%b)).
; The shift amount is the same in every lane, but it reaches the shl as a
; vector add of two splats rather than as a single splat.
define <16 x i8> @shl_add(<16 x i8> %v, i8 %a, i8 %b) {
|
|
; Splat %a: insert it into lane 0, then shuffle with an all-zero mask so that
; every result lane reads lane 0.
%t1 = insertelement <16 x i8> undef, i8 %a, i32 0
|
|
%va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
|
|
; Splat %b the same way.
%t2 = insertelement <16 x i8> undef, i8 %b, i32 0
|
|
%vb = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
|
|
; Lane-wise sum of the two splats; this vector is the shift amount.
%shift = add <16 x i8> %va, %vb
|
|
; Vector shift-left with a vector shift amount.
%r = shl <16 x i8> %v, %shift
|
|
ret <16 x i8> %r
|
|
}
|
|
|
|
; CHECK-LABEL: shl_abs:
|
|
; CHECK-NEXT: .functype shl_abs (v128, i32) -> (v128)
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push8=, $0, 0
|
|
; CHECK-NEXT: i8x16.splat $push0=, $1
|
|
; CHECK-NEXT: i8x16.abs $push98=, $pop0
|
|
; CHECK-NEXT: local.tee $push97=, $2=, $pop98
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push6=, $pop97, 0
|
|
; CHECK-NEXT: i32.const $push2=, 7
|
|
; CHECK-NEXT: i32.and $push7=, $pop6, $pop2
|
|
; CHECK-NEXT: i32.shl $push9=, $pop8, $pop7
|
|
; CHECK-NEXT: i8x16.splat $push10=, $pop9
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push4=, $0, 1
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push1=, $2, 1
|
|
; CHECK-NEXT: i32.const $push96=, 7
|
|
; CHECK-NEXT: i32.and $push3=, $pop1, $pop96
|
|
; CHECK-NEXT: i32.shl $push5=, $pop4, $pop3
|
|
; CHECK-NEXT: i8x16.replace_lane $push11=, $pop10, 1, $pop5
|
|
; ...
|
|
; CHECK: i8x16.extract_lane_u $push79=, $0, 15
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push77=, $2, 15
|
|
; CHECK-NEXT: i32.const $push82=, 7
|
|
; CHECK-NEXT: i32.and $push78=, $pop77, $pop82
|
|
; CHECK-NEXT: i32.shl $push80=, $pop79, $pop78
|
|
; CHECK-NEXT: i8x16.replace_lane $push81=, $pop76, 15, $pop80
|
|
; CHECK-NEXT: return $pop81
|
|
; shl_abs: shift every i8 lane of %v left by the lane-wise absolute value of
; splat(%a). The absolute value is spelled as sub / icmp sgt / select; the
; expectations preceding this function show it selected as i8x16.abs, with the
; shift itself expanded lane by lane.
define <16 x i8> @shl_abs(<16 x i8> %v, i8 %a) {
|
|
; Splat %a: insert it into lane 0, then shuffle with an all-zero mask so that
; every result lane reads lane 0.
%t1 = insertelement <16 x i8> undef, i8 %a, i32 0
|
|
%va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
|
|
; Lane-wise negation of the splat (0 - %va).
%nva = sub <16 x i8> zeroinitializer, %va
|
|
; Per-lane mask: true where the lane is strictly positive (signed compare).
%c = icmp sgt <16 x i8> %va, zeroinitializer
|
|
; Keep the positive lanes, take the negated value otherwise.
%shift = select <16 x i1> %c, <16 x i8> %va, <16 x i8> %nva
|
|
; Vector shift-left with a vector shift amount.
%r = shl <16 x i8> %v, %shift
|
|
ret <16 x i8> %r
|
|
}
|
|
|
|
; CHECK-LABEL: shl_abs_add:
|
|
; CHECK-NEXT: .functype shl_abs_add (v128, i32, i32) -> (v128)
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push11=, $0, 0
|
|
; CHECK-NEXT: i8x16.splat $push1=, $1
|
|
; CHECK-NEXT: i8x16.splat $push0=, $2
|
|
; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0
|
|
; CHECK-NEXT: v8x16.shuffle $push3=, $pop2, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
; CHECK-NEXT: i8x16.abs $push101=, $pop3
|
|
; CHECK-NEXT: local.tee $push100=, $3=, $pop101
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push9=, $pop100, 0
|
|
; CHECK-NEXT: i32.const $push5=, 7
|
|
; CHECK-NEXT: i32.and $push10=, $pop9, $pop5
|
|
; CHECK-NEXT: i32.shl $push12=, $pop11, $pop10
|
|
; CHECK-NEXT: i8x16.splat $push13=, $pop12
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push7=, $0, 1
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push4=, $3, 1
|
|
; CHECK-NEXT: i32.const $push99=, 7
|
|
; CHECK-NEXT: i32.and $push6=, $pop4, $pop99
|
|
; CHECK-NEXT: i32.shl $push8=, $pop7, $pop6
|
|
; CHECK-NEXT: i8x16.replace_lane $push14=, $pop13, 1, $pop8
|
|
; ...
|
|
; CHECK: i8x16.extract_lane_u $push82=, $0, 15
|
|
; CHECK-NEXT: i8x16.extract_lane_u $push80=, $3, 15
|
|
; CHECK-NEXT: i32.const $push85=, 7
|
|
; CHECK-NEXT: i32.and $push81=, $pop80, $pop85
|
|
; CHECK-NEXT: i32.shl $push83=, $pop82, $pop81
|
|
; CHECK-NEXT: i8x16.replace_lane $push84=, $pop79, 15, $pop83
|
|
; CHECK-NEXT: return $pop84
|
|
; shl_abs_add: shift every i8 lane of %v left by the lane-wise absolute value
; of (splat(%a) + splat(%b)). This combines the add-of-splats and the
; sub / icmp sgt / select absolute-value shapes in a single shift amount.
define <16 x i8> @shl_abs_add(<16 x i8> %v, i8 %a, i8 %b) {
|
|
; Splat %a: insert it into lane 0, then shuffle with an all-zero mask so that
; every result lane reads lane 0.
%t1 = insertelement <16 x i8> undef, i8 %a, i32 0
|
|
%va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
|
|
; Splat %b the same way.
%t2 = insertelement <16 x i8> undef, i8 %b, i32 0
|
|
%vb = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
|
|
; Lane-wise sum of the two splats.
%vadd = add <16 x i8> %va, %vb
|
|
; Lane-wise negation of the sum (0 - %vadd).
%nvadd = sub <16 x i8> zeroinitializer, %vadd
|
|
; Per-lane mask: true where the sum is strictly positive (signed compare).
%c = icmp sgt <16 x i8> %vadd, zeroinitializer
|
|
; Keep the positive lanes, take the negated value otherwise.
%shift = select <16 x i1> %c, <16 x i8> %vadd, <16 x i8> %nvadd
|
|
; Vector shift-left with a vector shift amount.
%r = shl <16 x i8> %v, %shift
|
|
ret <16 x i8> %r
|
|
}
|