We've already optimised these, so update the cost model to reflect it. Also skip the isBeforeLegalize check when lowering i8 muls, because that check then misses the cases where, say, v32i8 has been type-legalised into 2x v16i8. Finally, explicitly disable memory interleaving for any factor other than two or four.
46 lines
1.5 KiB
LLVM
46 lines
1.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc < %s -mtriple=wasm32 -mattr=+simd128 | FileCheck %s
|
|
|
|
; Test function: loads a <2 x i32> from the first pointer argument, derives a
; per-lane 0/1 shift amount from an unsigned compare, shifts the constant
; <16, 16> right by it, squares the result as <2 x i8>, and stores the two
; bytes through %pr.
;
; The expected wasm32 +simd128 output below performs the shifts on scalars
; (extract_lane / i32.and / i32.shr_u / replace_lane), emits the multiply as
; i16x8.extmul_low_i8x16_u, narrows with an i8x16.shuffle whose first eight
; lanes pick bytes 0, 2, ..., 14, and writes the result with a single
; v128.store16_lane.
define void @f(ptr %0, ptr %pr) {
|
|
; CHECK-LABEL: f:
|
|
; CHECK: .functype f (i32, i32) -> ()
|
|
; CHECK-NEXT: .local v128
|
|
; CHECK-NEXT: # %bb.0: # %BB
|
|
; CHECK-NEXT: local.get 1
|
|
; CHECK-NEXT: v128.const 0, 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
|
|
; CHECK-NEXT: i32.const 16
|
|
; CHECK-NEXT: local.get 0
|
|
; CHECK-NEXT: v128.load64_zero 0
|
|
; CHECK-NEXT: v128.const 0, 1, 0, 0
|
|
; CHECK-NEXT: i32x4.gt_u
|
|
; CHECK-NEXT: local.tee 2
|
|
; CHECK-NEXT: i32x4.extract_lane 0
|
|
; CHECK-NEXT: i32.const 1
|
|
; CHECK-NEXT: i32.and
|
|
; CHECK-NEXT: i32.shr_u
|
|
; CHECK-NEXT: i8x16.replace_lane 0
|
|
; CHECK-NEXT: i32.const 16
|
|
; CHECK-NEXT: local.get 2
|
|
; CHECK-NEXT: i32x4.extract_lane 1
|
|
; CHECK-NEXT: i32.const 1
|
|
; CHECK-NEXT: i32.and
|
|
; CHECK-NEXT: i32.shr_u
|
|
; CHECK-NEXT: i8x16.replace_lane 1
|
|
; CHECK-NEXT: local.tee 2
|
|
; CHECK-NEXT: local.get 2
|
|
; CHECK-NEXT: i16x8.extmul_low_i8x16_u
|
|
; CHECK-NEXT: local.get 2
|
|
; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
|
|
; CHECK-NEXT: v128.store16_lane 0, 0
|
|
; CHECK-NEXT: # fallthrough-return
|
|
BB:
|
|
; Load two 32-bit lanes from the first pointer argument.
%v0 = load <2 x i32>, ptr %0
|
|
; Lane-wise unsigned greater-than against the constant vector <0, 1>.
%v1 = icmp ugt <2 x i32> %v0, <i32 0, i32 1>
|
|
; Widen each i1 compare result to an i8 holding 0 or 1.
%v2 = zext <2 x i1> %v1 to <2 x i8>
|
|
; Arithmetic right shift of the constant 16 by the 0/1 amount in each lane.
; NOTE(review): the expected output uses i32.shr_u here, presumably because
; the shifted constant is non-negative - confirm.
%v3 = ashr <2 x i8> <i8 16, i8 16>, %v2
|
|
; Square each lane; this <2 x i8> multiply appears to be what the test targets.
%v4 = mul <2 x i8> %v3, %v3
|
|
; Store the two result bytes through the second pointer argument.
store <2 x i8> %v4, ptr %pr
|
|
ret void
|
|
}
|
|
|