From 4a9d038acd637c5742e6d1622d4ad803059825bd Mon Sep 17 00:00:00 2001 From: Nishant Patel Date: Mon, 18 Aug 2025 09:45:29 -0700 Subject: [PATCH 01/27] [MLIR][XeGPU] Distribute load_nd/store_nd/prefetch_nd with offsets from Wg to Sg (#153432) This PR adds pattern to distribute the load/store/prefetch nd ops with offsets from workgroup to subgroup IR. This PR is part of the transition to move offsets from create_nd to load/store/prefetch nd ops. Create_nd PR : #152351 --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 18 +- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 46 ++++ .../Transforms/XeGPUWgToSgDistribute.cpp | 218 +++++++++++++++- .../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 73 +++++- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 242 ++++++++++++++++++ 5 files changed, 586 insertions(+), 11 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index abc291c81a76..eb54d6887681 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -272,6 +272,11 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { let builders = [ OpBuilder<(ins "Value": $TensorDesc, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Value": $TensorDesc, + "ArrayRef": $offsets, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint)> @@ -348,6 +353,12 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ let builders = [ OpBuilder<(ins "Type": $value, "Value": $TensorDesc, + "UnitAttr": $packed, "DenseI64ArrayAttr": $transpose, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Type": $value, "Value": $TensorDesc, + "ArrayRef": $offsets, "UnitAttr": $packed, "DenseI64ArrayAttr": $transpose, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, 
@@ -419,7 +430,12 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Value": $value, "Value": $TensorDesc, + "ArrayRef": $offsets, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint)> ]; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index eee0fdc7160d..906c71d8b8da 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -385,6 +385,21 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state, l1_hint, l2_hint, l3_hint); } +void PrefetchNdOp::build(OpBuilder &builder, OperationState &state, + Value tensorDesc, ArrayRef offsets, + xegpu::CachePolicyAttr l1_hint, + xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint) { + SmallVector dynamicOffsets; + SmallVector staticOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + + auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); + + build(builder, state, tensorDesc, dynamicOffsets, staticOffsetsAttr, l1_hint, + l2_hint, l3_hint); +} + LogicalResult PrefetchNdOp::verify() { auto tdescTy = getTensorDescType(); if (tdescTy.isScattered()) @@ -427,6 +442,22 @@ void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType, l3_hint); } +void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType, + Value tensorDesc, ArrayRef offsets, + UnitAttr packed, DenseI64ArrayAttr transpose, + xegpu::CachePolicyAttr l1_hint, + xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint) { + SmallVector dynamicOffsets; + SmallVector staticOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + + auto staticOffsetsAttr = 
builder.getDenseI64ArrayAttr(staticOffsets); + + build(builder, state, retType, tensorDesc, dynamicOffsets, staticOffsetsAttr, + packed, transpose, l1_hint, l2_hint, l3_hint); +} + LogicalResult LoadNdOp::verify() { auto tdescTy = getTensorDescType(); auto valueTy = getType(); @@ -533,6 +564,21 @@ void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value, DenseI64ArrayAttr(), l1_hint, l2_hint, l3_hint); } +void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value, + Value tensorDesc, ArrayRef offsets, + xegpu::CachePolicyAttr l1_hint, + xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint) { + SmallVector dynamicOffsets; + SmallVector staticOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + + auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); + + build(builder, state, value, tensorDesc, dynamicOffsets, staticOffsetsAttr, + l1_hint, l2_hint, l3_hint); +} + LogicalResult StoreNdOp::verify() { auto dstTy = getTensorDescType(); // Tile auto valTy = getValueType(); // Vector diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index ecec186fe3fc..8f1208e77ca5 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -182,16 +182,16 @@ struct WgToSgCreateNdOp : public OpConversionPattern { layout.dropSgLayoutAndData()); SmallVector newCreateNdOps; - SmallVector wgOffsets = op.getMixedOffsets(); + SmallVector origOffsets = op.getMixedOffsets(); for (auto tdescOffsets : *maybeTdescOffsets) { SmallVector sgOffsets; size_t rank = tdescOffsets.size(); for (size_t i = 0; i < rank; i++) { - size_t idx = wgOffsets.size() - rank + i; + size_t idx = origOffsets.size() - rank + i; Value add = rewriter.createOrFold( loc, tdescOffsets[i], - getValueOrCreateConstantIndexOp(rewriter, loc, wgOffsets[idx])); + 
getValueOrCreateConstantIndexOp(rewriter, loc, origOffsets[idx])); sgOffsets.push_back(add); } @@ -296,6 +296,205 @@ struct WgToSgStoreNdOp : public OpConversionPattern { } }; +// Utility function to compute global offsets for subgroup operations. +// Returns a vector of new offsets for each subgroup, given the original op's +// offsets and subgroup relative offsets. +static SmallVector> +computeOffsets(Operation *op, ArrayRef> sgOffsetsList, + ArrayRef origOffsets, + ConversionPatternRewriter &rewriter) { + SmallVector> finalOffsets; + Location loc = op->getLoc(); + for (const auto &sgOffsets : sgOffsetsList) { + SmallVector newOffsets; + size_t rank = sgOffsets.size(); + for (size_t i = 0; i < rank; i++) { + size_t idx = origOffsets.size() - rank + i; + Value add = rewriter.createOrFold( + loc, sgOffsets[i], + getValueOrCreateConstantIndexOp(rewriter, loc, origOffsets[idx])); + newOffsets.push_back(add); + } + finalOffsets.push_back(std::move(newOffsets)); + } + return finalOffsets; +} + +// Utility function to get sgShape, sgOffsetList for a given +// op. 
+template +LogicalResult getSgOffsets(OpTy op, AdaptorTy adaptor, + ConversionPatternRewriter &rewriter, + SmallVector &sgShape, + SmallVector> &sgOffsetList) { + int64_t offsetSize = static_cast(op.getOffsets().size()); + if (offsetSize == 0 && (!op.getConstOffsetsAttr())) + return failure(); + + Location loc = op.getLoc(); + Value tdesc = op.getTensorDesc(); + auto tdescTy = dyn_cast(tdesc.getType()); + if (!tdescTy) + return failure(); + auto layout = dyn_cast(tdescTy.getLayout()); + if (!layout) + return failure(); + + SmallVector sgLayout; + auto sgLayoutAttr = layout.getSgLayout(); + if (!sgLayoutAttr) + return rewriter.notifyMatchFailure( + op, "sgLayout attribute is required in layout"); + sgLayout = llvm::to_vector_of(sgLayoutAttr.asArrayRef()); + + ArrayRef wgShape = tdescTy.getShape(); + int count; + std::tie(sgShape, count) = getSgShapeAndCount(wgShape, layout); + + // Get the subgroup ID + Value linearSgId = + gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); + + int64_t startOfRange = -1, endOfRange = -1; + bool sgIdRangeSpecified = isSgIdRangeSpecified(op, startOfRange, endOfRange); + + if (sgIdRangeSpecified) { + int64_t sgCount = endOfRange - startOfRange; + if (computeProduct(sgLayout) != sgCount) + return rewriter.notifyMatchFailure( + op, "sg_layout size must match the sg_id_range"); + Value startOfRangeVal = + rewriter.create(loc, startOfRange); + linearSgId = + rewriter.createOrFold(loc, linearSgId, startOfRangeVal); + } + + auto sgOffsets = layout.getOffsets(rewriter, loc, linearSgId, wgShape); + if (failed(sgOffsets)) + return failure(); + + sgOffsetList = *sgOffsets; + return success(); +} + +template +SmallVector getOffsets(OpTy op, + ConversionPatternRewriter &rewriter) { + SmallVector origOffsets; + if (auto constOffsets = op.getConstOffsetsAttr()) { + for (auto attr : constOffsets.asArrayRef()) + origOffsets.push_back(rewriter.getIndexAttr(attr)); + } + for (auto v : op.getOffsets()) + origOffsets.push_back(v); + return 
origOffsets; +} + +// This pattern transforms the LoadNdOp with explicit offsets to load +// subgroup data. +struct WgToSgLoadNdOpWithOffset : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::LoadNdOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + SmallVector sgShape; + SmallVector> sgOffsetList; + + // Do the distribution from workgroup to subgroup and get subgroup offsets + if (failed(getSgOffsets(op, adaptor, rewriter, sgShape, sgOffsetList))) + return failure(); + + // Get the original workgroup offsets + SmallVector origOffsets = getOffsets(op, rewriter); + + // Calculate the final offsets for each subgroup + auto finalOffsets = computeOffsets(op, sgOffsetList, origOffsets, rewriter); + + SmallVector newLoadOps; + for (auto [offsets, tdesc] : + llvm::zip(finalOffsets, adaptor.getTensorDesc())) { + VectorType newResTy = VectorType::get( + sgShape, + dyn_cast(tdesc.getType()).getElementType()); + auto newLoadOp = rewriter.create( + op.getLoc(), newResTy, tdesc, offsets, + /*packed=*/nullptr, + /*transpose=*/nullptr, op.getL1HintAttr(), op.getL2HintAttr(), + op.getL3HintAttr()); + newLoadOps.push_back(newLoadOp); + } + rewriter.replaceOpWithMultiple(op, {newLoadOps}); + return success(); + } +}; + +// This pattern transforms the StoreNdOp with explicit offsets to store +// subgroup data. 
+struct WgToSgStoreNdOpWithOffset + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::StoreNdOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + SmallVector sgShape; + SmallVector> sgOffsetList; + + // Do the distribution from workgroup to subgroup and get subgroup offsets + if (failed(getSgOffsets(op, adaptor, rewriter, sgShape, sgOffsetList))) + return failure(); + + // Get the original workgroup offsets + SmallVector origOffsets = getOffsets(op, rewriter); + + // Calculate the final offsets for each subgroup + auto finalOffsets = computeOffsets(op, sgOffsetList, origOffsets, rewriter); + + for (auto [offsets, tdesc, value] : + llvm::zip(finalOffsets, adaptor.getTensorDesc(), adaptor.getValue())) { + rewriter.create(op.getLoc(), value, tdesc, offsets, + op.getL1HintAttr(), op.getL2HintAttr(), + op.getL3HintAttr()); + } + rewriter.eraseOp(op); + return success(); + } +}; + +// This pattern transforms the PrefetchNdOp with explicit offsets to prefetch +// subgroup data. 
+struct WgToSgPrefetchNdOpWithOffset + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::PrefetchNdOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + SmallVector sgShape; + SmallVector> sgOffsetList; + + // Do the distribution from workgroup to subgroup and get subgroup offsets + if (failed(getSgOffsets(op, adaptor, rewriter, sgShape, sgOffsetList))) + return failure(); + + // Get the original workgroup offsets + SmallVector origOffsets = getOffsets(op, rewriter); + + // Calculate the final offsets for each subgroup + auto finalOffsets = computeOffsets(op, sgOffsetList, origOffsets, rewriter); + + for (auto [offsets, tdesc] : + llvm::zip(finalOffsets, adaptor.getTensorDesc())) { + rewriter.create( + op.getLoc(), tdesc, offsets, op.getL1HintAttr(), op.getL2HintAttr(), + op.getL3HintAttr()); + } + rewriter.eraseOp(op); + return success(); + } +}; + /// This pattern transforms the UpdateNdOffsetOp to update the offsets of a /// subgroup descriptor. It creates an UpdateNdOffsetOp op to update the /// offsets of the new subgroup src tensor descriptors. 
@@ -690,12 +889,13 @@ struct WgToSgArithConstantOp : public OpConversionPattern { namespace mlir { namespace xegpu { void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { - patterns.add( - patterns.getContext()); + patterns + .add(patterns.getContext()); } } // namespace xegpu } // namespace mlir diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index b6f44b5bc0b6..6ff7a94d678a 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -10,5 +10,76 @@ gpu.module @test_distribution { %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> gpu.return - } + } + + // CHECK-LABEL: load_nd_tdesc_with_offset + gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] + // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-NOT: xegpu.load_nd + %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + gpu.return + } + + // CHECK-LABEL: store_nd_with_offset + gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) { + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] + // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + // CHECK-NOT: xegpu.store_nd + %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + xegpu.store_nd %load, %tdesc[0, 0] + : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return + } + + // CHECK-LABEL: 
prefetch_nd_tdesc_with_offset + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { + // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] + // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + // CHECK-NOT: xegpu.prefetch_nd + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + xegpu.prefetch_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return + } + + // CHECK-LABEL: dpas + // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) + gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> + // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + // CHECK-NOT: xegpu.create_nd_tdesc + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> + // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + // CHECK-NOT: xegpu.create_nd_tdesc + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} + // CHECK-SAME-COUNT-16: {layout = #xegpu.layout} + // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-NOT: xegpu.dpas + %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> + -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a[0, 0] + : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> + -> vector<256x128xf16> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16> + -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b[0, 0] + : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> + -> vector<128x256xf16> + %dpas = xegpu.dpas %load_a, %load_b + {layout_result_0 = #xegpu.layout} + : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> + gpu.return + } } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir 
b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 025d48e22307..07a0b86223c3 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -1,5 +1,7 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s +//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> +//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> gpu.module @test_distribution { // CHECK-LABEL: create_nd_tdesc_no_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> @@ -21,4 +23,244 @@ gpu.module @test_distribution { -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> gpu.return } + + // CHECK-LABEL: load_nd_tdesc_with_offset + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { + //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index + //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] + //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] + //CHECK: %[[LOAD:.*]] = xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> + %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + gpu.return + } + + // CHECK-LABEL: store_nd_with_offsets + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + gpu.func @store_nd_with_offsets(%src: memref<256x128xf32>) { + //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index + //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] + //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] + //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, 
#xegpu.layout> + -> vector<256x128xf32> + xegpu.store_nd %load, %tdesc[0, 0] + : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return +} + + // CHECK-LABEL: prefetch_nd_tdesc_with_offset + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { + //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index + //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] + //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] + //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %cst0 = arith.constant 0 : index + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + xegpu.prefetch_nd %tdesc[%cst0, %cst0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return + } + + // CHECK-LABEL: dpas + gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> + -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a[0, 0] + : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + -> vector<128x128xf16> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> + -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b[0, 0] + : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + -> vector<128x128xf16> + %dpas = xegpu.dpas %load_a, %load_b + {layout_result_0 = #xegpu.layout} + : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> + gpu.return + } + + // CHECK-LABEL: dpas_no_sg_data + gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + %tdesc_a 
= xegpu.create_nd_tdesc %a : memref<128x128xf16> + -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a[0, 0] + : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + -> vector<128x128xf16> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> + -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b[0, 0] + : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + -> vector<128x128xf16> + %dpas = xegpu.dpas %load_a, %load_b + {layout_result_0 = #xegpu.layout} + : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> + gpu.return + } + + // CHECK-LABEL: dpas_with_no_create_nd_desc + gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) { + // CHECK-NOT: vector<32x32xf32> + %dpas = xegpu.dpas %a, %b + {layout = #xegpu.layout} + : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32> + gpu.return + } + + // CHECK-LABEL: broadcast_dim1 + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32> + gpu.func @broadcast_dim1(%src: memref<256x1xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32> + -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x1xf32, #xegpu.layout> + -> vector<256x1xf32> + // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<32x1xf32> to vector<32x32xf32> + %broadcast = vector.broadcast %load + {layout_result_0 = #xegpu.layout} + : vector<256x1xf32> to vector<256x32xf32> + gpu.return + } + + // CHECK-LABEL: broadcast_dim0 + // CHECK-SAME: %[[ARG_0:.*]]: memref<1x128xf32> + gpu.func @broadcast_dim0(%src: memref<1x128xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32> + -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<1x128xf32, #xegpu.layout> + -> vector<1x128xf32> + // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<1x32xf32> 
to vector<32x32xf32> + %broadcast = vector.broadcast %load + {layout_result_0 = #xegpu.layout} + : vector<1x128xf32> to vector<32x128xf32> + gpu.return + } + + // CHECK-LABEL: gemm_with_load_store_offset + // CHECK-SAME: %[[ARG_0:.*]]: memref<1024x1024xf16>, %[[ARG_1:.*]]: memref<1024x1024xf16>, %[[ARG_2:.*]]: memref<1024x1024xf32> + gpu.func @gemm_with_load_store_offset(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) { + //CHECK: [[c0:%.+]] = arith.constant 0 : index + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[c1024:%.+]] = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %0 = arith.muli %block_id_x, %c128 : index + %1 = arith.muli %block_id_y, %c128 : index + %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + // CHECK: [[DESC_A:%.+]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x128xf16> + // CHECK: [[DESC_B:%.+]] = xegpu.create_nd_tdesc %[[ARG_1]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x16xf16> + %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + // load_nd with offset + %5 = xegpu.load_nd %2[%0, %1] : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> + %6 = xegpu.load_nd %3[%0, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %7 = xegpu.load_nd %4[%c0, %1] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + // scf.for loop + // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] + // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> + // 
CHECK-SAME: (vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32>) + // CHECK: [[c:%.+]] = xegpu.dpas [[arg4]], [[arg5]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> + // CHECK: [[a:%.+]] = xegpu.load_nd [[DESC_A]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: [[b:%.+]] = xegpu.load_nd [[DESC_B]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> + // CHECK: scf.yield [[a]], [[b]], [[c]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> + %8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5) + -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) { + // load_nd with offset inside loop + %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_result_0 = #xegpu.layout} + : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> + %10 = xegpu.load_nd %3[%arg3, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %11 = xegpu.load_nd %4[%c0, %arg3] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> + } + // store_nd with offset + xegpu.store_nd %8#2, %2[%0, %1] : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + gpu.return + } + + // CHECK-LABEL: @subgroup_id_range + gpu.func @subgroup_id_range(%src: memref<256x128xf32>, %src1: memref<128x256xf32>, %src2: memref<128x64xf32>) { + %sg_id = gpu.subgroup_id : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c31 = arith.constant 31 : index + %c3 = arith.constant 3 : index + %cond1 = arith.cmpi sge, %sg_id, %c0 : index + %cond2 = arith.cmpi slt, %sg_id, %c1 : index + %cond = arith.andi %cond1, %cond2 : i1 + scf.if %cond { + // CHECK-NOT: index.sub + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> 
!xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + } {sg_id_range = #xegpu.range<[0, 32]>} + %cond3 = arith.cmpi sge, %sg_id, %c2 : index + %cond4 = arith.cmpi slt, %sg_id, %c31 : index + %cond5 = arith.andi %cond3, %cond4 : i1 + scf.if %cond5 { + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C2:.*]] = arith.constant 2 : index + // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] + %tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32> + -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + -> vector<128x64xf32> + %exp = math.exp %load {layout_result_0 = #xegpu.layout} : vector<128x64xf32> + }{sg_id_range = #xegpu.range<[2, 18]>} + gpu.return + } + + // CHECK-LABEL: @subgroup_id_range_nested_if + gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) { + %sg_id = gpu.subgroup_id : index + %c1 = arith.constant 1 : i1 + %c3 = arith.constant 3 : index + %c32 = arith.constant 32 : index + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + %cond1 = arith.cmpi sge, %sg_id, %c3 : index + %cond2 = arith.cmpi slt, %sg_id, %c32 : index + %cond = arith.andi %cond1, %cond2 : i1 + scf.if %c1 { + scf.if %cond { + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C3:.*]] = arith.constant 3 : index + // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]] + %td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32> + -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + %ld = xegpu.load_nd %td[0, 0] + : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + -> vector<128x64xf32> + %exp = math.exp %ld {layout_result_0 = #xegpu.layout} : vector<128x64xf32> + } + } 
{sg_id_range = #xegpu.range<[3, 19]>} + gpu.return + } } From 1b60236200735abc39e5bd3a2280123e9789dec5 Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Mon, 18 Aug 2025 18:45:52 +0200 Subject: [PATCH 02/27] [SimplifyCFG] Avoid redundant calls in gather. (NFC) (#154133) Split out from https://github.com/llvm/llvm-project/pull/154007 as it showed compile time improvements NFC as there needs to be at least two icmps that is part of the chain. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 28 ++++++++++++----------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 0ca7188470d8..055e8cadaab7 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -565,6 +565,9 @@ struct ConstantComparesGatherer { /// Number of comparisons matched in the and/or chain unsigned UsedICmps = 0; + /// If the elements in Vals matches the comparisons + bool IsEq = false; + /// Construct and compute the result for the comparison instruction Cond ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) { gather(Cond); @@ -736,23 +739,23 @@ private: /// vector. /// One "Extra" case is allowed to differ from the other. void gather(Value *V) { - bool isEQ = match(V, m_LogicalOr(m_Value(), m_Value())); - + Value *Op0, *Op1; + if (match(V, m_LogicalOr(m_Value(Op0), m_Value(Op1)))) + IsEq = true; + else if (match(V, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) + IsEq = false; + else + return; // Keep a stack (SmallVector for efficiency) for depth-first traversal - SmallVector DFT; - SmallPtrSet Visited; - - // Initialize - Visited.insert(V); - DFT.push_back(V); + SmallVector DFT{Op0, Op1}; + SmallPtrSet Visited{V, Op0, Op1}; while (!DFT.empty()) { V = DFT.pop_back_val(); if (Instruction *I = dyn_cast(V)) { // If it is a || (or && depending on isEQ), process the operands. - Value *Op0, *Op1; - if (isEQ ? 
match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))) + if (IsEq ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))) : match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) { if (Visited.insert(Op1).second) DFT.push_back(Op1); @@ -763,7 +766,7 @@ private: } // Try to match the current instruction - if (matchInstruction(I, isEQ)) + if (matchInstruction(I, IsEq)) // Match succeed, continue the loop continue; } @@ -5103,6 +5106,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, Value *CompVal = ConstantCompare.CompValue; unsigned UsedICmps = ConstantCompare.UsedICmps; Value *ExtraCase = ConstantCompare.Extra; + bool TrueWhenEqual = ConstantCompare.IsEq; // If we didn't have a multiply compared value, fail. if (!CompVal) @@ -5112,8 +5116,6 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, if (UsedICmps <= 1) return false; - bool TrueWhenEqual = match(Cond, m_LogicalOr(m_Value(), m_Value())); - // There might be duplicate constants in the list, which the switch // instruction can't handle, remove them now. array_pod_sort(Values.begin(), Values.end(), constantIntSortPredicate); From 97f554249c564e769956abfcb3266925745482c5 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 18 Aug 2025 17:48:42 +0100 Subject: [PATCH 03/27] [VPlan] Preserve nusw in createInBoundsPtrAdd (#151549) Rename createInBoundsPtrAdd to createNoWrapPtrAdd, and preserve nusw as well as inbounds at the callsite. 
--- .../Vectorize/LoopVectorizationPlanner.h | 14 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 4 +- ...aved-accesses-different-insert-position.ll | 2 +- .../interleaved-accesses-gep-nowrap-flags.ll | 148 ++++++++++++++++++ 4 files changed, 158 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 4856ebebb596..838476dcae66 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -256,13 +256,15 @@ public: new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, GEPNoWrapFlags::none(), DL, Name)); } - VPInstruction *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, - DebugLoc DL = DebugLoc::getUnknown(), - const Twine &Name = "") { - return tryInsertInstruction( - new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::inBounds(), DL, Name)); + + VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset, + GEPNoWrapFlags GEPFlags, + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { + return tryInsertInstruction(new VPInstruction( + VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name)); } + VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 05c12b7a1adc..14532244d574 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2615,9 +2615,7 @@ void VPlanTransforms::createInterleaveGroups( VPValue *OffsetVPV = Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); VPBuilder B(InsertPos); - Addr = NW.isInBounds() - ? 
B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV) - : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV); + Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } // If the group is reverse, adjust the index to refer to the last vector // lane instead of the first. We adjust the index from the first vector diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll index fa339f45fcdd..dd6b829fcb5c 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll @@ -86,7 +86,7 @@ define void @test_ig_insert_pos_at_end_of_vpbb(ptr noalias %dst, ptr noalias %sr ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr nusw { i16, i16, i16, i16 }, ptr [[SRC]], i64 [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 -4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr nusw i8, ptr [[TMP4]], i32 -4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-gep-nowrap-flags.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-gep-nowrap-flags.ll index 552f6a4ec62d..a6ba29ed1ca0 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-gep-nowrap-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-gep-nowrap-flags.ll @@ -185,3 +185,151 @@ loop: exit: ret void } + +define void @nusw_preservation_2(ptr %src, ptr noalias %dst) { +; CHECK-LABEL: define void 
@nusw_preservation_2( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr nusw i8, ptr [[SRC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr nusw i8, ptr [[TMP1]], i32 -1 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr nusw i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ] + %or.1 = or disjoint i64 %iv2, 1 + %gep.src.or.1 = getelementptr nusw i8, ptr %src, i64 %or.1 + %load.src.1 = load i8, ptr %gep.src.or.1, align 1 + %gep.src.iv2 = getelementptr nusw i8, ptr %src, i64 %iv2 + %load.src.2 = load i8, ptr %gep.src.iv2, align 1 + %add = add i8 %load.src.1, %load.src.2 + %gep.dst.iv = getelementptr nusw i8, ptr %dst, i64 %iv + 
store i8 %add, ptr %gep.dst.iv, align 1 + %iv2.next = add i64 %iv2, 2 + %iv.next = add i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, 100 + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + +define void @inbounds_preservation_2(ptr %src, ptr noalias %dst) { +; CHECK-LABEL: define void @inbounds_preservation_2( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -1 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ] + %or.1 = or disjoint i64 %iv2, 1 + 
%gep.src.or.1 = getelementptr inbounds i8, ptr %src, i64 %or.1 + %load.src.1 = load i8, ptr %gep.src.or.1, align 1 + %gep.src.iv2 = getelementptr inbounds i8, ptr %src, i64 %iv2 + %load.src.2 = load i8, ptr %gep.src.iv2, align 1 + %add = add i8 %load.src.1, %load.src.2 + %gep.dst.iv = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 %add, ptr %gep.dst.iv, align 1 + %iv2.next = add i64 %iv2, 2 + %iv.next = add i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, 100 + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + +define void @nuw_drop_2(ptr %src, ptr noalias %dst) { +; CHECK-LABEL: define void @nuw_drop_2( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr nuw i8, ptr [[SRC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 -1 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr nuw i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; 
CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ] + %or.1 = or disjoint i64 %iv2, 1 + %gep.src.or.1 = getelementptr nuw i8, ptr %src, i64 %or.1 + %load.src.1 = load i8, ptr %gep.src.or.1, align 1 + %gep.src.iv2 = getelementptr nuw i8, ptr %src, i64 %iv2 + %load.src.2 = load i8, ptr %gep.src.iv2, align 1 + %add = add i8 %load.src.1, %load.src.2 + %gep.dst.iv = getelementptr nuw i8, ptr %dst, i64 %iv + store i8 %add, ptr %gep.dst.iv, align 1 + %iv2.next = add i64 %iv2, 2 + %iv.next = add i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, 100 + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + From 8135b7c1abd7d22f98cf3dbd7d7a93c9fc7755c6 Mon Sep 17 00:00:00 2001 From: Tobias Stadler Date: Mon, 18 Aug 2025 18:04:53 +0100 Subject: [PATCH 04/27] [LV] Emit all remarks for unvectorizable instructions (#153833) If ExtraAnalysis is requested, emit all remarks caused by unvectorizable instructions - instead of only the first. This is in line with how other places handle DoExtraAnalysis and it can be quite helpful to get info about all instructions in a loop that prevent vectorization. --- .../Vectorize/LoopVectorizationLegality.h | 3 + .../Vectorize/LoopVectorizationLegality.cpp | 508 +++++++++--------- .../X86/vectorization-remarks-missed.ll | 36 ++ 3 files changed, 301 insertions(+), 246 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 43ff084816d1..48ee93acbe00 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -493,6 +493,9 @@ private: /// and we only need to check individual instructions. 
bool canVectorizeInstrs(); + /// Check if an individual instruction is vectorizable. + bool canVectorizeInstr(Instruction &I); + /// When we vectorize loops we may change the order in which /// we read and write from memory. This method checks if it is /// legal to vectorize the code, considering only memory constrains. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index c47fd9421fdd..789047a2a28e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -793,267 +793,30 @@ static bool canWidenCallReturnType(Type *Ty) { } bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *Header = TheLoop->getHeader(); + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + bool Result = true; // For each block in the loop. for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for hazards. for (Instruction &I : *BB) { - if (auto *Phi = dyn_cast(&I)) { - Type *PhiTy = Phi->getType(); - // Check that this PHI type is allowed. - if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && - !PhiTy->isPointerTy()) { - reportVectorizationFailure("Found a non-int non-pointer PHI", - "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", ORE, TheLoop); - return false; - } - - // If this PHINode is not in the header block, then we know that we - // can convert it to select during if-conversion. No need to check if - // the PHIs in this block are induction or reduction variables. - if (BB != Header) { - // Non-header phi nodes that have outside uses can be vectorized. Add - // them to the list of allowed exits. - // Unsafe cyclic dependencies with header phis are identified during - // legalization for reduction, induction and fixed order - // recurrences. 
- AllowedExit.insert(&I); - continue; - } - - // We only allow if-converted PHIs with exactly two incoming values. - if (Phi->getNumIncomingValues() != 2) { - reportVectorizationFailure("Found an invalid PHI", - "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", ORE, TheLoop, Phi); - return false; - } - - RecurrenceDescriptor RedDes; - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT, PSE.getSE())) { - Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); - AllowedExit.insert(RedDes.getLoopExitInstr()); - Reductions[Phi] = RedDes; - continue; - } - - // We prevent matching non-constant strided pointer IVS to preserve - // historical vectorizer behavior after a generalization of the - // IVDescriptor code. The intent is to remove this check, but we - // have to fix issues around code quality for such loops first. - auto IsDisallowedStridedPointerInduction = - [](const InductionDescriptor &ID) { - if (AllowStridedPointerIVs) - return false; - return ID.getKind() == InductionDescriptor::IK_PtrInduction && - ID.getConstIntStepValue() == nullptr; - }; - - // TODO: Instead of recording the AllowedExit, it would be good to - // record the complementary set: NotAllowedExit. These include (but may - // not be limited to): - // 1. Reduction phis as they represent the one-before-last value, which - // is not available when vectorized - // 2. Induction phis and increment when SCEV predicates cannot be used - // outside the loop - see addInductionPhi - // 3. Non-Phis with outside uses when SCEV predicates cannot be used - // outside the loop - see call to hasOutsideLoopUser in the non-phi - // handling below - // 4. FixedOrderRecurrence phis that can possibly be handled by - // extraction. - // By recording these, we can then reason about ways to vectorize each - // of these NotAllowedExit. 
- InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) && - !IsDisallowedStridedPointerInduction(ID)) { - addInductionPhi(Phi, ID, AllowedExit); - Requirements->addExactFPMathInst(ID.getExactFPMathInst()); - continue; - } - - if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { - AllowedExit.insert(Phi); - FixedOrderRecurrences.insert(Phi); - continue; - } - - // As a last resort, coerce the PHI to a AddRec expression - // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) && - !IsDisallowedStridedPointerInduction(ID)) { - addInductionPhi(Phi, ID, AllowedExit); - continue; - } - - reportVectorizationFailure("Found an unidentified PHI", - "value that could not be identified as " - "reduction is used outside the loop", - "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi); + Result &= canVectorizeInstr(I); + if (!DoExtraAnalysis && !Result) return false; - } // end of PHI handling - - // We handle calls that: - // * Have a mapping to an IR intrinsic. - // * Have a vector version available. - auto *CI = dyn_cast(&I); - - if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && - !(CI->getCalledFunction() && TLI && - (!VFDatabase::getMappings(*CI).empty() || - isTLIScalarize(*TLI, *CI)))) { - // If the call is a recognized math libary call, it is likely that - // we can vectorize it given loosened floating-point constraints. - LibFunc Func; - bool IsMathLibCall = - TLI && CI->getCalledFunction() && - CI->getType()->isFloatingPointTy() && - TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) && - TLI->hasOptimizedCodeGen(Func); - - if (IsMathLibCall) { - // TODO: Ideally, we should not use clang-specific language here, - // but it's hard to provide meaningful yet generic advice. - // Also, should this be guarded by allowExtraAnalysis() and/or be part - // of the returned info from isFunctionVectorizable()? 
- reportVectorizationFailure( - "Found a non-intrinsic callsite", - "library call cannot be vectorized. " - "Try compiling with -fno-math-errno, -ffast-math, " - "or similar flags", - "CantVectorizeLibcall", ORE, TheLoop, CI); - } else { - reportVectorizationFailure("Found a non-intrinsic callsite", - "call instruction cannot be vectorized", - "CantVectorizeLibcall", ORE, TheLoop, CI); - } - return false; - } - - // Some intrinsics have scalar arguments and should be same in order for - // them to be vectorized (i.e. loop invariant). - if (CI) { - auto *SE = PSE.getSE(); - Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); - for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx) - if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) { - if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), - TheLoop)) { - reportVectorizationFailure("Found unvectorizable intrinsic", - "intrinsic instruction cannot be vectorized", - "CantVectorizeIntrinsic", ORE, TheLoop, CI); - return false; - } - } - } - - // If we found a vectorized variant of a function, note that so LV can - // make better decisions about maximum VF. - if (CI && !VFDatabase::getMappings(*CI).empty()) - VecCallVariantsFound = true; - - auto CanWidenInstructionTy = [](Instruction const &Inst) { - Type *InstTy = Inst.getType(); - if (!isa(InstTy)) - return canVectorizeTy(InstTy); - - // For now, we only recognize struct values returned from calls where - // all users are extractvalue as vectorizable. All element types of the - // struct must be types that can be widened. - return isa(Inst) && canWidenCallReturnType(InstTy) && - all_of(Inst.users(), IsaPred); - }; - - // Check that the instruction return type is vectorizable. - // We can't vectorize casts from vector type to scalar type. - // Also, we can't vectorize extractelement instructions. 
- if (!CanWidenInstructionTy(I) || - (isa(I) && - !VectorType::isValidElementType(I.getOperand(0)->getType())) || - isa(I)) { - reportVectorizationFailure("Found unvectorizable type", - "instruction return type cannot be vectorized", - "CantVectorizeInstructionReturnType", ORE, TheLoop, &I); - return false; - } - - // Check that the stored type is vectorizable. - if (auto *ST = dyn_cast(&I)) { - Type *T = ST->getValueOperand()->getType(); - if (!VectorType::isValidElementType(T)) { - reportVectorizationFailure("Store instruction cannot be vectorized", - "CantVectorizeStore", ORE, TheLoop, ST); - return false; - } - - // For nontemporal stores, check that a nontemporal vector version is - // supported on the target. - if (ST->getMetadata(LLVMContext::MD_nontemporal)) { - // Arbitrarily try a vector of 2 elements. - auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); - assert(VecTy && "did not find vectorized version of stored type"); - if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { - reportVectorizationFailure( - "nontemporal store instruction cannot be vectorized", - "CantVectorizeNontemporalStore", ORE, TheLoop, ST); - return false; - } - } - - } else if (auto *LD = dyn_cast(&I)) { - if (LD->getMetadata(LLVMContext::MD_nontemporal)) { - // For nontemporal loads, check that a nontemporal vector version is - // supported on the target (arbitrarily try a vector of 2 elements). - auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); - assert(VecTy && "did not find vectorized version of load type"); - if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { - reportVectorizationFailure( - "nontemporal load instruction cannot be vectorized", - "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); - return false; - } - } - - // FP instructions can allow unsafe algebra, thus vectorizable by - // non-IEEE-754 compliant SIMD units. 
- // This applies to floating-point math operations and calls, not memory - // operations, shuffles, or casts, as they don't change precision or - // semantics. - } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && - !I.isFast()) { - LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); - Hints->setPotentiallyUnsafe(); - } - - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { - // We can safely vectorize loops where instructions within the loop are - // used outside the loop only if the SCEV predicates within the loop is - // same as outside the loop. Allowing the exit means reusing the SCEV - // outside the loop. - if (PSE.getPredicate().isAlwaysTrue()) { - AllowedExit.insert(&I); - continue; - } - reportVectorizationFailure("Value cannot be used outside the loop", - "ValueUsedOutsideLoop", ORE, TheLoop, &I); - return false; - } - } // next instr. 
+ } } if (!PrimaryInduction) { if (Inductions.empty()) { - reportVectorizationFailure("Did not find one integer induction var", + reportVectorizationFailure( + "Did not find one integer induction var", "loop induction variable could not be identified", "NoInductionVariable", ORE, TheLoop); return false; } if (!WidestIndTy) { - reportVectorizationFailure("Did not find one integer induction var", + reportVectorizationFailure( + "Did not find one integer induction var", "integer loop induction variable could not be identified", "NoIntegerInductionVariable", ORE, TheLoop); return false; @@ -1067,6 +830,259 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) PrimaryInduction = nullptr; + return Result; +} + +bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) { + BasicBlock *BB = I.getParent(); + BasicBlock *Header = TheLoop->getHeader(); + + if (auto *Phi = dyn_cast(&I)) { + Type *PhiTy = Phi->getType(); + // Check that this PHI type is allowed. + if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { + reportVectorizationFailure( + "Found a non-int non-pointer PHI", + "loop control flow is not understood by vectorizer", + "CFGNotUnderstood", ORE, TheLoop); + return false; + } + + // If this PHINode is not in the header block, then we know that we + // can convert it to select during if-conversion. No need to check if + // the PHIs in this block are induction or reduction variables. + if (BB != Header) { + // Non-header phi nodes that have outside uses can be vectorized. Add + // them to the list of allowed exits. + // Unsafe cyclic dependencies with header phis are identified during + // legalization for reduction, induction and fixed order + // recurrences. + AllowedExit.insert(&I); + return true; + } + + // We only allow if-converted PHIs with exactly two incoming values. 
+ if (Phi->getNumIncomingValues() != 2) { + reportVectorizationFailure( + "Found an invalid PHI", + "loop control flow is not understood by vectorizer", + "CFGNotUnderstood", ORE, TheLoop, Phi); + return false; + } + + RecurrenceDescriptor RedDes; + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT, + PSE.getSE())) { + Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); + AllowedExit.insert(RedDes.getLoopExitInstr()); + Reductions[Phi] = RedDes; + return true; + } + + // We prevent matching non-constant strided pointer IVS to preserve + // historical vectorizer behavior after a generalization of the + // IVDescriptor code. The intent is to remove this check, but we + // have to fix issues around code quality for such loops first. + auto IsDisallowedStridedPointerInduction = + [](const InductionDescriptor &ID) { + if (AllowStridedPointerIVs) + return false; + return ID.getKind() == InductionDescriptor::IK_PtrInduction && + ID.getConstIntStepValue() == nullptr; + }; + + // TODO: Instead of recording the AllowedExit, it would be good to + // record the complementary set: NotAllowedExit. These include (but may + // not be limited to): + // 1. Reduction phis as they represent the one-before-last value, which + // is not available when vectorized + // 2. Induction phis and increment when SCEV predicates cannot be used + // outside the loop - see addInductionPhi + // 3. Non-Phis with outside uses when SCEV predicates cannot be used + // outside the loop - see call to hasOutsideLoopUser in the non-phi + // handling below + // 4. FixedOrderRecurrence phis that can possibly be handled by + // extraction. + // By recording these, we can then reason about ways to vectorize each + // of these NotAllowedExit. 
+ InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) && + !IsDisallowedStridedPointerInduction(ID)) { + addInductionPhi(Phi, ID, AllowedExit); + Requirements->addExactFPMathInst(ID.getExactFPMathInst()); + return true; + } + + if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) { + AllowedExit.insert(Phi); + FixedOrderRecurrences.insert(Phi); + return true; + } + + // As a last resort, coerce the PHI to a AddRec expression + // and re-try classifying it a an induction PHI. + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) && + !IsDisallowedStridedPointerInduction(ID)) { + addInductionPhi(Phi, ID, AllowedExit); + return true; + } + + reportVectorizationFailure("Found an unidentified PHI", + "value that could not be identified as " + "reduction is used outside the loop", + "NonReductionValueUsedOutsideLoop", ORE, TheLoop, + Phi); + return false; + } // end of PHI handling + + // We handle calls that: + // * Have a mapping to an IR intrinsic. + // * Have a vector version available. + auto *CI = dyn_cast(&I); + + if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && + !(CI->getCalledFunction() && TLI && + (!VFDatabase::getMappings(*CI).empty() || isTLIScalarize(*TLI, *CI)))) { + // If the call is a recognized math libary call, it is likely that + // we can vectorize it given loosened floating-point constraints. + LibFunc Func; + bool IsMathLibCall = + TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() && + TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) && + TLI->hasOptimizedCodeGen(Func); + + if (IsMathLibCall) { + // TODO: Ideally, we should not use clang-specific language here, + // but it's hard to provide meaningful yet generic advice. + // Also, should this be guarded by allowExtraAnalysis() and/or be part + // of the returned info from isFunctionVectorizable()? + reportVectorizationFailure( + "Found a non-intrinsic callsite", + "library call cannot be vectorized. 
" + "Try compiling with -fno-math-errno, -ffast-math, " + "or similar flags", + "CantVectorizeLibcall", ORE, TheLoop, CI); + } else { + reportVectorizationFailure("Found a non-intrinsic callsite", + "call instruction cannot be vectorized", + "CantVectorizeLibcall", ORE, TheLoop, CI); + } + return false; + } + + // Some intrinsics have scalar arguments and should be same in order for + // them to be vectorized (i.e. loop invariant). + if (CI) { + auto *SE = PSE.getSE(); + Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); + for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx) + if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) { + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), TheLoop)) { + reportVectorizationFailure( + "Found unvectorizable intrinsic", + "intrinsic instruction cannot be vectorized", + "CantVectorizeIntrinsic", ORE, TheLoop, CI); + return false; + } + } + } + + // If we found a vectorized variant of a function, note that so LV can + // make better decisions about maximum VF. + if (CI && !VFDatabase::getMappings(*CI).empty()) + VecCallVariantsFound = true; + + auto CanWidenInstructionTy = [](Instruction const &Inst) { + Type *InstTy = Inst.getType(); + if (!isa(InstTy)) + return canVectorizeTy(InstTy); + + // For now, we only recognize struct values returned from calls where + // all users are extractvalue as vectorizable. All element types of the + // struct must be types that can be widened. + return isa(Inst) && canWidenCallReturnType(InstTy) && + all_of(Inst.users(), IsaPred); + }; + + // Check that the instruction return type is vectorizable. + // We can't vectorize casts from vector type to scalar type. + // Also, we can't vectorize extractelement instructions. 
+ if (!CanWidenInstructionTy(I) || + (isa(I) && + !VectorType::isValidElementType(I.getOperand(0)->getType())) || + isa(I)) { + reportVectorizationFailure("Found unvectorizable type", + "instruction return type cannot be vectorized", + "CantVectorizeInstructionReturnType", ORE, + TheLoop, &I); + return false; + } + + // Check that the stored type is vectorizable. + if (auto *ST = dyn_cast(&I)) { + Type *T = ST->getValueOperand()->getType(); + if (!VectorType::isValidElementType(T)) { + reportVectorizationFailure("Store instruction cannot be vectorized", + "CantVectorizeStore", ORE, TheLoop, ST); + return false; + } + + // For nontemporal stores, check that a nontemporal vector version is + // supported on the target. + if (ST->getMetadata(LLVMContext::MD_nontemporal)) { + // Arbitrarily try a vector of 2 elements. + auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2); + assert(VecTy && "did not find vectorized version of stored type"); + if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) { + reportVectorizationFailure( + "nontemporal store instruction cannot be vectorized", + "CantVectorizeNontemporalStore", ORE, TheLoop, ST); + return false; + } + } + + } else if (auto *LD = dyn_cast(&I)) { + if (LD->getMetadata(LLVMContext::MD_nontemporal)) { + // For nontemporal loads, check that a nontemporal vector version is + // supported on the target (arbitrarily try a vector of 2 elements). + auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2); + assert(VecTy && "did not find vectorized version of load type"); + if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) { + reportVectorizationFailure( + "nontemporal load instruction cannot be vectorized", + "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); + return false; + } + } + + // FP instructions can allow unsafe algebra, thus vectorizable by + // non-IEEE-754 compliant SIMD units. 
+ // This applies to floating-point math operations and calls, not memory + // operations, shuffles, or casts, as they don't change precision or + // semantics. + } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && + !I.isFast()) { + LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); + Hints->setPotentiallyUnsafe(); + } + + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { + // We can safely vectorize loops where instructions within the loop are + // used outside the loop only if the SCEV predicates within the loop is + // same as outside the loop. Allowing the exit means reusing the SCEV + // outside the loop. + if (PSE.getPredicate().isAlwaysTrue()) { + AllowedExit.insert(&I); + return true; + } + reportVectorizationFailure("Value cannot be used outside the loop", + "ValueUsedOutsideLoop", ORE, TheLoop, &I); + return false; + } + return true; } diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll index 70134fa6bc78..5ec093c5af6b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll @@ -117,6 +117,33 @@ ; YAML-NEXT: ... ; YAML-NEXT: --- !Analysis ; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: NonReductionValueUsedOutsideLoop +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 } +; YAML-NEXT: Function: test_multiple_failures +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: value that could not be identified as reduction is used outside the loop +; YAML-NEXT: ... 
+; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: CantVectorizeLibcall +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 29, Column: 11 } +; YAML-NEXT: Function: test_multiple_failures +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: call instruction cannot be vectorized +; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: NoInductionVariable +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 } +; YAML-NEXT: Function: test_multiple_failures +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: loop induction variable could not be identified +; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize ; YAML-NEXT: Name: UnsupportedUncountableLoop ; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 } ; YAML-NEXT: Function: test_multiple_failures @@ -124,6 +151,15 @@ ; YAML-NEXT: - String: 'loop not vectorized: ' ; YAML-NEXT: - String: Cannot vectorize uncountable loop ; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: CantComputeNumberOfIterations +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 } +; YAML-NEXT: Function: test_multiple_failures +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: could not determine number of loop iterations +; YAML-NEXT: ... ; YAML: --- !Missed ; YAML-NEXT: Pass: loop-vectorize ; YAML-NEXT: Name: MissedDetails From 4eb1a07d7d1a9722e84490b0ff79d3ae5e260f76 Mon Sep 17 00:00:00 2001 From: Yang Bai Date: Tue, 19 Aug 2025 01:09:12 +0800 Subject: [PATCH 05/27] [mlir][vector] Support multi-dimensional vectors in VectorFromElementsLowering (#151175) This patch introduces a new unrolling-based approach for lowering multi-dimensional `vector.from_elements` operations. **Implementation Details:** 1. 
**New Transform Pattern**: Added `UnrollFromElements` that unrolls an N-D (N>=2) from_elements op to a (N-1)-D from_elements op along the outermost dimension. 2. **Utility Functions**: Added `unrollVectorOp` to reuse the unroll algo of vector.gather for vector.from_elements. 3. **Integration**: Added the unrolling pattern to the convert-vector-to-llvm pass as a temporary transformation. 4. Use direct LLVM dialect operations instead of intermediate vector.insert operations for efficiency in `VectorFromElementsLowering`. **Example:** ```mlir // unroll %v = vector.from_elements %e0, %e1, %e2, %e3 : vector<2x2xf32> => %poison_2d = ub.poison : vector<2x2xf32> %vec_1d_0 = vector.from_elements %e0, %e1 : vector<2xf32> %vec_2d_0 = vector.insert %vec_1d_0, %poison_2d [0] : vector<2xf32> into vector<2x2xf32> %vec_1d_1 = vector.from_elements %e2, %e3 : vector<2xf32> %result = vector.insert %vec_1d_1, %vec_2d_0 [1] : vector<2xf32> into vector<2x2xf32> // convert-vector-to-llvm %v = vector.from_elements %e0, %e1, %e2, %e3 : vector<2x2xf32> => %poison_2d = ub.poison : vector<2x2xf32> %poison_2d_cast = builtin.unrealized_conversion_cast %poison_2d : vector<2x2xf32> to !llvm.array<2 x vector<2xf32>> %poison_1d_0 = llvm.mlir.poison : vector<2xf32> %c0_0 = llvm.mlir.constant(0 : i64) : i64 %vec_1d_0_0 = llvm.insertelement %e0, %poison_1d_0[%c0_0 : i64] : vector<2xf32> %c1_0 = llvm.mlir.constant(1 : i64) : i64 %vec_1d_0_1 = llvm.insertelement %e1, %vec_1d_0_0[%c1_0 : i64] : vector<2xf32> %vec_2d_0 = llvm.insertvalue %vec_1d_0_1, %poison_2d_cast[0] : !llvm.array<2 x vector<2xf32>> %poison_1d_1 = llvm.mlir.poison : vector<2xf32> %c0_1 = llvm.mlir.constant(0 : i64) : i64 %vec_1d_1_0 = llvm.insertelement %e2, %poison_1d_1[%c0_1 : i64] : vector<2xf32> %c1_1 = llvm.mlir.constant(1 : i64) : i64 %vec_1d_1_1 = llvm.insertelement %e3, %vec_1d_1_0[%c1_1 : i64] : vector<2xf32> %vec_2d_1 = llvm.insertvalue %vec_1d_1_1, %vec_2d_0[1] : !llvm.array<2 x vector<2xf32>> %result = 
builtin.unrealized_conversion_cast %vec_2d_1 : !llvm.array<2 x vector<2xf32>> to vector<2x2xf32> ``` --------- Co-authored-by: Nicolas Vasilache Co-authored-by: Yang Bai Co-authored-by: James Newling Co-authored-by: Diego Caballero --- .../Vector/TransformOps/VectorTransformOps.td | 11 ++++ .../Vector/Transforms/LoweringPatterns.h | 8 +++ .../mlir/Dialect/Vector/Utils/VectorUtils.h | 17 +++++ .../VectorToLLVM/ConvertVectorToLLVM.cpp | 14 ++-- .../VectorToLLVM/ConvertVectorToLLVMPass.cpp | 1 + .../TransformOps/VectorTransformOps.cpp | 5 ++ .../Dialect/Vector/Transforms/CMakeLists.txt | 1 + .../Transforms/LowerVectorFromElements.cpp | 65 +++++++++++++++++++ .../Vector/Transforms/LowerVectorGather.cpp | 33 +++------- mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp | 26 ++++++++ .../VectorToLLVM/vector-to-llvm.mlir | 37 +++++++++++ .../Vector/vector-from-elements-lowering.mlir | 45 +++++++++++++ .../Vector/vector-gather-lowering.mlir | 2 +- .../Dialect/Vector/TestVectorTransforms.cpp | 24 +++++++ .../python/dialects/transform_vector_ext.py | 2 + 15 files changed, 261 insertions(+), 30 deletions(-) create mode 100644 mlir/lib/Dialect/Vector/Transforms/LowerVectorFromElements.cpp create mode 100644 mlir/test/Dialect/Vector/vector-from-elements-lowering.mlir diff --git a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td index 299f198e4ab9..07a4117a37b2 100644 --- a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td +++ b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td @@ -254,6 +254,17 @@ def ApplyLowerGatherPatternsOp : Op]> { + let description = [{ + Indicates that vector from_elements operations should be unrolled + along the outermost dimension. 
+ }]; + + let assemblyFormat = "attr-dict"; +} + def ApplyLowerScanPatternsOp : Op]> { diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h index e03f0dabece5..47f96112a943 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h @@ -303,6 +303,14 @@ void populateVectorRankReducingFMAPattern(RewritePatternSet &patterns); void populateVectorToFromElementsToShuffleTreePatterns( RewritePatternSet &patterns, PatternBenefit benefit = 1); +/// Populate the pattern set with the following patterns: +/// +/// [UnrollFromElements] +/// Unrolls 2 or more dimensional `vector.from_elements` ops by unrolling the +/// outermost dimension. +void populateVectorFromElementsLoweringPatterns(RewritePatternSet &patterns, + PatternBenefit benefit = 1); + /// Populate the pattern set with the following patterns: /// /// [ContractionOpToMatmulOpLowering] diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h index 8bd54cf31b89..ace26990601c 100644 --- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h +++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h @@ -12,6 +12,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/UB/IR/UBOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/BuiltinAttributes.h" @@ -238,6 +239,22 @@ Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source, /// static sizes in `shape`. LogicalResult isValidMaskedInputVector(ArrayRef shape, ArrayRef inputVectorSizes); + +/// Generic utility for unrolling n-D vector operations to (n-1)-D operations. +/// This handles the common pattern of: +/// 1. Check if already 1-D. If so, return failure. 
+/// 2. Check for scalable dimensions. If so, return failure. +/// 3. Create poison initialized result. +/// 4. Loop through the outermost dimension, execute the UnrollVectorOpFn to +/// create sub vectors. +/// 5. Insert the sub vectors back into the final vector. +/// 6. Replace the original op with the new result. +using UnrollVectorOpFn = + function_ref; + +LogicalResult unrollVectorOp(Operation *op, PatternRewriter &rewriter, + UnrollVectorOpFn unrollFn); + } // namespace vector /// Constructs a permutation map of invariant memref indices to vector diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index f9e2a01dbf96..afc3d1b12ac0 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1891,15 +1891,21 @@ struct VectorFromElementsLowering ConversionPatternRewriter &rewriter) const override { Location loc = fromElementsOp.getLoc(); VectorType vectorType = fromElementsOp.getType(); - // TODO: Multi-dimensional vectors lower to !llvm.array<... x vector<>>. - // Such ops should be handled in the same way as vector.insert. + // Only support 1-D vectors. Multi-dimensional vectors should have been + // transformed to 1-D vectors by the vector-to-vector transformations before + // this. 
if (vectorType.getRank() > 1) return rewriter.notifyMatchFailure(fromElementsOp, "rank > 1 vectors are not supported"); Type llvmType = typeConverter->convertType(vectorType); + Type llvmIndexType = typeConverter->convertType(rewriter.getIndexType()); Value result = LLVM::PoisonOp::create(rewriter, loc, llvmType); - for (auto [idx, val] : llvm::enumerate(adaptor.getElements())) - result = vector::InsertOp::create(rewriter, loc, val, result, idx); + for (auto [idx, val] : llvm::enumerate(adaptor.getElements())) { + auto constIdx = + LLVM::ConstantOp::create(rewriter, loc, llvmIndexType, idx); + result = LLVM::InsertElementOp::create(rewriter, loc, llvmType, result, + val, constIdx); + } rewriter.replaceOp(fromElementsOp, result); return success(); } diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp index cf108690c374..9852df6970fd 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp @@ -94,6 +94,7 @@ void ConvertVectorToLLVMPass::runOnOperation() { populateVectorStepLoweringPatterns(patterns); populateVectorRankReducingFMAPattern(patterns); populateVectorGatherLoweringPatterns(patterns); + populateVectorFromElementsLoweringPatterns(patterns); if (armI8MM) { if (armNeon) arm_neon::populateLowerContractionToNeonI8MMPatterns(patterns); diff --git a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp index 2d5cc070558c..fe066dc04ad5 100644 --- a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp +++ b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp @@ -139,6 +139,11 @@ void transform::ApplyLowerGatherPatternsOp::populatePatterns( vector::populateVectorGatherLoweringPatterns(patterns); } +void transform::ApplyUnrollFromElementsPatternsOp::populatePatterns( + RewritePatternSet &patterns) { + 
vector::populateVectorFromElementsLoweringPatterns(patterns); +} + void transform::ApplyLowerScanPatternsOp::populatePatterns( RewritePatternSet &patterns) { vector::populateVectorScanLoweringPatterns(patterns); diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt index 9e287fc10999..acbf2b746037 100644 --- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt @@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRVectorTransforms LowerVectorBitCast.cpp LowerVectorBroadcast.cpp LowerVectorContract.cpp + LowerVectorFromElements.cpp LowerVectorGather.cpp LowerVectorInterleave.cpp LowerVectorMask.cpp diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorFromElements.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorFromElements.cpp new file mode 100644 index 000000000000..c22fd54cef46 --- /dev/null +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorFromElements.cpp @@ -0,0 +1,65 @@ +//===- LowerVectorFromElements.cpp - Lower 'vector.from_elements' op -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements target-independent rewrites and utilities to lower the +// 'vector.from_elements' operation. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" + +#define DEBUG_TYPE "lower-vector-from-elements" + +using namespace mlir; + +namespace { + +/// Unrolls 2 or more dimensional `vector.from_elements` ops by unrolling the +/// outermost dimension. 
For example: +/// ``` +/// %v = vector.from_elements %e0, %e1, %e2, %e3, %e4, %e5 : vector<2x3xf32> +/// +/// ==> +/// +/// %0 = ub.poison : vector<2x3xf32> +/// %v0 = vector.from_elements %e0, %e1, %e2 : vector<3xf32> +/// %1 = vector.insert %v0, %0 [0] : vector<3xf32> into vector<2x3xf32> +/// %v1 = vector.from_elements %e3, %e4, %e5 : vector<3xf32> +/// %v = vector.insert %v1, %1 [1] : vector<3xf32> into vector<2x3xf32> +/// ``` +/// +/// When applied exhaustively, this will produce a sequence of 1-d from_elements +/// ops. +struct UnrollFromElements : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::FromElementsOp op, + PatternRewriter &rewriter) const override { + ValueRange allElements = op.getElements(); + + auto unrollFromElementsFn = [&](PatternRewriter &rewriter, Location loc, + VectorType subTy, int64_t index) { + size_t subTyNumElements = subTy.getNumElements(); + assert((index + 1) * subTyNumElements <= allElements.size() && + "out of bounds"); + ValueRange subElements = + allElements.slice(index * subTyNumElements, subTyNumElements); + return vector::FromElementsOp::create(rewriter, loc, subTy, subElements); + }; + + return unrollVectorOp(op, rewriter, unrollFromElementsFn); + } +}; + +} // namespace + +void mlir::vector::populateVectorFromElementsLoweringPatterns( + RewritePatternSet &patterns, PatternBenefit benefit) { + patterns.add(patterns.getContext(), benefit); +} diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp index e062f55f8767..90f21c53246b 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp @@ -54,27 +54,13 @@ struct UnrollGather : OpRewritePattern { LogicalResult matchAndRewrite(vector::GatherOp op, PatternRewriter &rewriter) const override { - VectorType resultTy = op.getType(); - if (resultTy.getRank() < 2) - return 
rewriter.notifyMatchFailure(op, "already 1-D"); - - // Unrolling doesn't take vscale into account. Pattern is disabled for - // vectors with leading scalable dim(s). - if (resultTy.getScalableDims().front()) - return rewriter.notifyMatchFailure(op, "cannot unroll scalable dim"); - - Location loc = op.getLoc(); Value indexVec = op.getIndexVec(); Value maskVec = op.getMask(); Value passThruVec = op.getPassThru(); - Value result = arith::ConstantOp::create(rewriter, loc, resultTy, - rewriter.getZeroAttr(resultTy)); - - VectorType subTy = VectorType::Builder(resultTy).dropDim(0); - - for (int64_t i = 0, e = resultTy.getShape().front(); i < e; ++i) { - int64_t thisIdx[1] = {i}; + auto unrollGatherFn = [&](PatternRewriter &rewriter, Location loc, + VectorType subTy, int64_t index) { + int64_t thisIdx[1] = {index}; Value indexSubVec = vector::ExtractOp::create(rewriter, loc, indexVec, thisIdx); @@ -82,15 +68,12 @@ struct UnrollGather : OpRewritePattern { vector::ExtractOp::create(rewriter, loc, maskVec, thisIdx); Value passThruSubVec = vector::ExtractOp::create(rewriter, loc, passThruVec, thisIdx); - Value subGather = vector::GatherOp::create( - rewriter, loc, subTy, op.getBase(), op.getIndices(), indexSubVec, - maskSubVec, passThruSubVec); - result = - vector::InsertOp::create(rewriter, loc, subGather, result, thisIdx); - } + return vector::GatherOp::create(rewriter, loc, subTy, op.getBase(), + op.getIndices(), indexSubVec, maskSubVec, + passThruSubVec); + }; - rewriter.replaceOp(op, result); - return success(); + return unrollVectorOp(op, rewriter, unrollGatherFn); } }; diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp index 6e2fa35e1279..841e1384e03b 100644 --- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp +++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp @@ -392,3 +392,29 @@ vector::isValidMaskedInputVector(ArrayRef shape, } return success(); } + +LogicalResult vector::unrollVectorOp(Operation *op, 
PatternRewriter &rewriter, + vector::UnrollVectorOpFn unrollFn) { + assert(op->getNumResults() == 1 && "expected single result"); + assert(isa(op->getResult(0).getType()) && "expected vector type"); + VectorType resultTy = cast(op->getResult(0).getType()); + if (resultTy.getRank() < 2) + return rewriter.notifyMatchFailure(op, "already 1-D"); + + // Unrolling doesn't take vscale into account. Pattern is disabled for + // vectors with leading scalable dim(s). + if (resultTy.getScalableDims().front()) + return rewriter.notifyMatchFailure(op, "cannot unroll scalable dim"); + + Location loc = op->getLoc(); + Value result = ub::PoisonOp::create(rewriter, loc, resultTy); + VectorType subTy = VectorType::Builder(resultTy).dropDim(0); + + for (int64_t i = 0, e = resultTy.getShape().front(); i < e; ++i) { + Value subVector = unrollFn(rewriter, loc, subTy, i); + result = vector::InsertOp::create(rewriter, loc, subVector, result, i); + } + + rewriter.replaceOp(op, result); + return success(); +} diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 72810b5dddaa..07d335117de0 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -1737,3 +1737,40 @@ func.func @step() -> vector<4xindex> { %0 = vector.step : vector<4xindex> return %0 : vector<4xindex> } + + +// ----- + +//===----------------------------------------------------------------------===// +// vector.from_elements +//===----------------------------------------------------------------------===// + +// NOTE: We unroll multi-dimensional from_elements ops with pattern `UnrollFromElements` +// and then convert the 1-D from_elements ops to llvm. 
+ +// CHECK-LABEL: func @from_elements_3d +// CHECK-SAME: %[[ARG_0:.*]]: f32, %[[ARG_1:.*]]: f32, %[[ARG_2:.*]]: f32, %[[ARG_3:.*]]: f32) +// CHECK: %[[UNDEF_RES:.*]] = ub.poison : vector<2x1x2xf32> +// CHECK: %[[UNDEF_RES_LLVM:.*]] = builtin.unrealized_conversion_cast %[[UNDEF_RES]] : vector<2x1x2xf32> to !llvm.array<2 x array<1 x vector<2xf32>>> +// CHECK: %[[UNDEF_VEC_RANK_2:.*]] = ub.poison : vector<1x2xf32> +// CHECK: %[[UNDEF_VEC_RANK_2_LLVM:.*]] = builtin.unrealized_conversion_cast %[[UNDEF_VEC_RANK_2]] : vector<1x2xf32> to !llvm.array<1 x vector<2xf32>> +// CHECK: %[[UNDEF_VEC0:.*]] = llvm.mlir.poison : vector<2xf32> +// CHECK: %[[C0_0:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[VEC0_0:.*]] = llvm.insertelement %[[ARG_0]], %[[UNDEF_VEC0]][%[[C0_0]] : i64] : vector<2xf32> +// CHECK: %[[C1_0:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VEC0_1:.*]] = llvm.insertelement %[[ARG_1]], %[[VEC0_0]][%[[C1_0]] : i64] : vector<2xf32> +// CHECK: %[[RES_RANK_2_0:.*]] = llvm.insertvalue %[[VEC0_1]], %[[UNDEF_VEC_RANK_2_LLVM]][0] : !llvm.array<1 x vector<2xf32>> +// CHECK: %[[RES_0:.*]] = llvm.insertvalue %[[RES_RANK_2_0]], %[[UNDEF_RES_LLVM]][0] : !llvm.array<2 x array<1 x vector<2xf32>>> +// CHECK: %[[UNDEF_VEC1:.*]] = llvm.mlir.poison : vector<2xf32> +// CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[VEC1_0:.*]] = llvm.insertelement %[[ARG_2]], %[[UNDEF_VEC1]][%[[C0_1]] : i64] : vector<2xf32> +// CHECK: %[[C1_1:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VEC1_1:.*]] = llvm.insertelement %[[ARG_3]], %[[VEC1_0]][%[[C1_1]] : i64] : vector<2xf32> +// CHECK: %[[RES_RANK_2_1:.*]] = llvm.insertvalue %[[VEC1_1]], %[[UNDEF_VEC_RANK_2_LLVM]][0] : !llvm.array<1 x vector<2xf32>> +// CHECK: %[[RES_1:.*]] = llvm.insertvalue %[[RES_RANK_2_1]], %[[RES_0]][1] : !llvm.array<2 x array<1 x vector<2xf32>>> +// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[RES_1]] : !llvm.array<2 x array<1 x vector<2xf32>>> to 
vector<2x1x2xf32> +// CHECK: return %[[CAST]] +func.func @from_elements_3d(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32) -> vector<2x1x2xf32> { + %0 = vector.from_elements %arg0, %arg1, %arg2, %arg3 : vector<2x1x2xf32> + return %0 : vector<2x1x2xf32> +} diff --git a/mlir/test/Dialect/Vector/vector-from-elements-lowering.mlir b/mlir/test/Dialect/Vector/vector-from-elements-lowering.mlir new file mode 100644 index 000000000000..8fac608ed569 --- /dev/null +++ b/mlir/test/Dialect/Vector/vector-from-elements-lowering.mlir @@ -0,0 +1,45 @@ +// RUN: mlir-opt %s -test-unroll-vector-from-elements | FileCheck %s --check-prefix=CHECK-UNROLL + +//===----------------------------------------------------------------------===// +// Test UnrollFromElements. +//===----------------------------------------------------------------------===// + +// CHECK-UNROLL-LABEL: @unroll_from_elements_2d +// CHECK-UNROLL-SAME: (%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32, %[[ARG2:.*]]: f32, %[[ARG3:.*]]: f32) +// CHECK-UNROLL-NEXT: %[[UNDEF_RES:.*]] = ub.poison : vector<2x2xf32> +// CHECK-UNROLL-NEXT: %[[VEC_0:.*]] = vector.from_elements %[[ARG0]], %[[ARG1]] : vector<2xf32> +// CHECK-UNROLL-NEXT: %[[RES_0:.*]] = vector.insert %[[VEC_0]], %[[UNDEF_RES]] [0] : vector<2xf32> into vector<2x2xf32> +// CHECK-UNROLL-NEXT: %[[VEC_1:.*]] = vector.from_elements %[[ARG2]], %[[ARG3]] : vector<2xf32> +// CHECK-UNROLL-NEXT: %[[RES_1:.*]] = vector.insert %[[VEC_1]], %[[RES_0]] [1] : vector<2xf32> into vector<2x2xf32> +// CHECK-UNROLL-NEXT: return %[[RES_1]] : vector<2x2xf32> +func.func @unroll_from_elements_2d(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32) -> vector<2x2xf32> { + %0 = vector.from_elements %arg0, %arg1, %arg2, %arg3 : vector<2x2xf32> + return %0 : vector<2x2xf32> +} + +// CHECK-UNROLL-LABEL: @unroll_from_elements_3d +// CHECK-UNROLL-SAME: (%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32, %[[ARG2:.*]]: f32, %[[ARG3:.*]]: f32) +// CHECK-UNROLL-NEXT: %[[UNDEF_RES:.*]] = ub.poison : vector<2x1x2xf32> +// 
CHECK-UNROLL-NEXT: %[[UNDEF_RANK_2:.*]] = ub.poison : vector<1x2xf32> +// CHECK-UNROLL-NEXT: %[[VEC_0:.*]] = vector.from_elements %[[ARG0]], %[[ARG1]] : vector<2xf32> +// CHECK-UNROLL-NEXT: %[[RANK_2_0:.*]] = vector.insert %[[VEC_0]], %[[UNDEF_RANK_2]] [0] : vector<2xf32> into vector<1x2xf32> +// CHECK-UNROLL-NEXT: %[[RES_0:.*]] = vector.insert %[[RANK_2_0]], %[[UNDEF_RES]] [0] : vector<1x2xf32> into vector<2x1x2xf32> +// CHECK-UNROLL-NEXT: %[[VEC_1:.*]] = vector.from_elements %[[ARG2]], %[[ARG3]] : vector<2xf32> +// CHECK-UNROLL-NEXT: %[[RANK_2_1:.*]] = vector.insert %[[VEC_1]], %[[UNDEF_RANK_2]] [0] : vector<2xf32> into vector<1x2xf32> +// CHECK-UNROLL-NEXT: %[[RES_1:.*]] = vector.insert %[[RANK_2_1]], %[[RES_0]] [1] : vector<1x2xf32> into vector<2x1x2xf32> +// CHECK-UNROLL-NEXT: return %[[RES_1]] : vector<2x1x2xf32> +func.func @unroll_from_elements_3d(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32) -> vector<2x1x2xf32> { + %0 = vector.from_elements %arg0, %arg1, %arg2, %arg3 : vector<2x1x2xf32> + return %0 : vector<2x1x2xf32> +} + +// 1-D vector.from_elements should not be unrolled. 
+ +// CHECK-UNROLL-LABEL: @negative_unroll_from_elements_1d +// CHECK-UNROLL-SAME: (%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32) +// CHECK-UNROLL-NEXT: %[[RES:.*]] = vector.from_elements %[[ARG0]], %[[ARG1]] : vector<2xf32> +// CHECK-UNROLL-NEXT: return %[[RES]] : vector<2xf32> +func.func @negative_unroll_from_elements_1d(%arg0: f32, %arg1: f32) -> vector<2xf32> { + %0 = vector.from_elements %arg0, %arg1 : vector<2xf32> + return %0 : vector<2xf32> +} diff --git a/mlir/test/Dialect/Vector/vector-gather-lowering.mlir b/mlir/test/Dialect/Vector/vector-gather-lowering.mlir index 5be267c1be98..9c2a508671e0 100644 --- a/mlir/test/Dialect/Vector/vector-gather-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-gather-lowering.mlir @@ -81,7 +81,7 @@ func.func @gather_memref_1d_i32_index(%base: memref, %v: vector<2xi32>, % // CHECK-SAME: %[[PASS:.*]]: vector<2x[3]xf32> // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[INIT:.*]] = arith.constant dense<0.000000e+00> : vector<2x[3]xf32> +// CHECK: %[[INIT:.*]] = ub.poison : vector<2x[3]xf32> // CHECK: %[[IDXVEC0:.*]] = vector.extract %[[IDXVEC]][0] : vector<[3]xindex> from vector<2x[3]xindex> // CHECK: %[[MASK0:.*]] = vector.extract %[[MASK]][0] : vector<[3]xi1> from vector<2x[3]xi1> // CHECK: %[[PASS0:.*]] = vector.extract %[[PASS]][0] : vector<[3]xf32> from vector<2x[3]xf32> diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp index f89c944b5c56..bb1598ee3efe 100644 --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -786,6 +786,28 @@ struct TestVectorGatherLowering } }; +struct TestUnrollVectorFromElements + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestUnrollVectorFromElements) + + StringRef getArgument() const final { + return "test-unroll-vector-from-elements"; + } + StringRef getDescription() const 
final { + return "Test unrolling patterns for from_elements ops"; + } + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + populateVectorFromElementsLoweringPatterns(patterns); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); + } +}; + struct TestFoldArithExtensionIntoVectorContractPatterns : public PassWrapper> { @@ -1059,6 +1081,8 @@ void registerTestVectorLowerings() { PassRegistration(); + PassRegistration(); + PassRegistration(); PassRegistration(); diff --git a/mlir/test/python/dialects/transform_vector_ext.py b/mlir/test/python/dialects/transform_vector_ext.py index a51f2154d1f7..5a648fe07331 100644 --- a/mlir/test/python/dialects/transform_vector_ext.py +++ b/mlir/test/python/dialects/transform_vector_ext.py @@ -46,6 +46,8 @@ def non_configurable_patterns(): vector.ApplyLowerOuterProductPatternsOp() # CHECK: transform.apply_patterns.vector.lower_gather vector.ApplyLowerGatherPatternsOp() + # CHECK: transform.apply_patterns.vector.unroll_from_elements + vector.ApplyUnrollFromElementsPatternsOp() # CHECK: transform.apply_patterns.vector.lower_scan vector.ApplyLowerScanPatternsOp() # CHECK: transform.apply_patterns.vector.lower_shape_cast From c2e7fad44691ed44281bde9e8322e70be0e6aeec Mon Sep 17 00:00:00 2001 From: Panagiotis Karouzakis <45971450+karouzakisp@users.noreply.github.com> Date: Mon, 18 Aug 2025 20:11:16 +0300 Subject: [PATCH 06/27] [DemandedBits] Support non-constant shift amounts (#148880) This patch adds support for the shift operators to handle non-constant shift operands. 
ashr proof -->https://alive2.llvm.org/ce/z/EN-siK lshr proof --> https://alive2.llvm.org/ce/z/eeGzyB shl proof --> https://alive2.llvm.org/ce/z/dpvbkq --- llvm/lib/Analysis/DemandedBits.cpp | 69 +++++++++ llvm/test/Analysis/DemandedBits/ashr.ll | 198 ++++++++++++++++++++++++ llvm/test/Analysis/DemandedBits/lshr.ll | 198 ++++++++++++++++++++++++ llvm/test/Analysis/DemandedBits/shl.ll | 134 +++++++++++++++- 4 files changed, 598 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Analysis/DemandedBits/ashr.ll create mode 100644 llvm/test/Analysis/DemandedBits/lshr.ll diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp index 6694d5cc06c8..e0881751aef7 100644 --- a/llvm/lib/Analysis/DemandedBits.cpp +++ b/llvm/lib/Analysis/DemandedBits.cpp @@ -76,6 +76,26 @@ void DemandedBits::determineLiveOperandBits( computeKnownBits(V2, Known2, DL, &AC, UserI, &DT); } }; + auto GetShiftedRange = [&](uint64_t Min, uint64_t Max, bool ShiftLeft) { + auto ShiftF = [ShiftLeft](const APInt &Mask, unsigned ShiftAmnt) { + return ShiftLeft ? Mask.shl(ShiftAmnt) : Mask.lshr(ShiftAmnt); + }; + AB = APInt::getZero(BitWidth); + uint64_t LoopRange = Max - Min; + APInt Mask = AOut; + APInt Shifted = AOut; // AOut | (AOut << 1) | ... | (AOut << (ShiftAmnt - 1) + for (unsigned ShiftAmnt = 1; ShiftAmnt <= LoopRange; ShiftAmnt <<= 1) { + if (LoopRange & ShiftAmnt) { + // Account for (LoopRange - ShiftAmnt, LoopRange] + Mask |= ShiftF(Shifted, LoopRange - ShiftAmnt + 1); + // Clears the low bit. 
+ LoopRange -= ShiftAmnt; + } + // [0, ShiftAmnt) -> [0, ShiftAmnt * 2) + Shifted |= ShiftF(Shifted, ShiftAmnt); + } + AB = ShiftF(Mask, Min); + }; switch (UserI->getOpcode()) { default: break; @@ -183,6 +203,17 @@ void DemandedBits::determineLiveOperandBits( AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); else if (S->hasNoUnsignedWrap()) AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + // similar to Lshr case + GetShiftedRange(Min, Max, /*ShiftLeft=*/false); + const auto *S = cast(UserI); + if (S->hasNoSignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, Max + 1); + else if (S->hasNoUnsignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, Max); } } break; @@ -197,6 +228,24 @@ void DemandedBits::determineLiveOperandBits( // (they must be zero). if (cast(UserI)->isExact()) AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + // Suppose AOut == 0b0000 0001 + // [min, max] = [1, 3] + // iteration 1 shift by 1 mask is 0b0000 0011 + // iteration 2 shift by 2 mask is 0b0000 1111 + // iteration 3, shiftAmnt = 4 > max - min, we stop. + // + // After the iterations we need one more shift by min, + // to move from 0b0000 1111 to --> 0b0001 1110. + // The loop populates the mask relative to (0,...,max-min), + // but we need coverage from (min, max). + // This is why the shift by min is needed. + GetShiftedRange(Min, Max, /*ShiftLeft=*/true); + if (cast(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, Max); } } break; @@ -217,6 +266,26 @@ void DemandedBits::determineLiveOperandBits( // (they must be zero). 
if (cast(UserI)->isExact()) AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } else { + ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr); + uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1); + uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1); + GetShiftedRange(Min, Max, /*ShiftLeft=*/true); + if (Max && + (AOut & APInt::getHighBitsSet(BitWidth, Max)).getBoolValue()) { + // Suppose AOut = 0011 1100 + // [min, max] = [1, 3] + // ShiftAmount = 1 : Mask is 1000 0000 + // ShiftAmount = 2 : Mask is 1100 0000 + // ShiftAmount = 3 : Mask is 1110 0000 + // The Mask with Max covers every case in [min, max], + // so we are done + AB.setSignBit(); + } + // If the shift is exact, then the low bits are not dead + // (they must be zero). + if (cast(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, Max); } } break; diff --git a/llvm/test/Analysis/DemandedBits/ashr.ll b/llvm/test/Analysis/DemandedBits/ashr.ll new file mode 100644 index 000000000000..6185d4c492d8 --- /dev/null +++ b/llvm/test/Analysis/DemandedBits/ashr.ll @@ -0,0 +1,198 @@ +; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s + +define i8 @test_ashr_const_amount_4(i32 %a) { +; CHECK-LABEL: 'test_ashr_const_amount_4' +; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, 4 +; CHECK-DAG: DemandedBits: 0xff0 for %a in %ashr = ashr i32 %a, 4 +; CHECK-DAG: DemandedBits: 0xffffffff for 4 in %ashr = ashr i32 %a, 4 +; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8 +; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8 +; + %ashr = ashr i32 %a, 4 + %ashr.t = trunc i32 %ashr to i8 + ret i8 %ashr.t +} + +define i8 @test_ashr_const_amount_5(i32 %a) { +; CHECK-LABEL: 'test_ashr_const_amount_5' +; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, 5 +; CHECK-DAG: DemandedBits: 0x1fe0 for %a in %ashr = ashr i32 %a, 5 +; CHECK-DAG: DemandedBits: 0xffffffff for 5 in %ashr = ashr i32 %a, 5 +; CHECK-DAG: DemandedBits: 
0xff for %ashr.t = trunc i32 %ashr to i8 +; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8 +; + %ashr = ashr i32 %a, 5 + %ashr.t = trunc i32 %ashr to i8 + ret i8 %ashr.t +} + +define i8 @test_ashr_const_amount_8(i32 %a) { +; CHECK-LABEL: 'test_ashr_const_amount_8' +; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, 8 +; CHECK-DAG: DemandedBits: 0xff00 for %a in %ashr = ashr i32 %a, 8 +; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %ashr = ashr i32 %a, 8 +; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8 +; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8 +; + %ashr = ashr i32 %a, 8 + %ashr.t = trunc i32 %ashr to i8 + ret i8 %ashr.t +} + +define i8 @test_ashr_const_amount_9(i32 %a) { + +; CHECK-LABEL: 'test_ashr_const_amount_9' +; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8 +; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8 +; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, 8 +; CHECK-DAG: DemandedBits: 0xff00 for %a in %ashr = ashr i32 %a, 8 +; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %ashr = ashr i32 %a, 8 +; + %ashr = ashr i32 %a, 8 + %ashr.t = trunc i32 %ashr to i8 + ret i8 %ashr.t +} + +define i8 @test_ashr(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_ashr' +; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %ashr = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %ashr = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8 +; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8 +; + %ashr = ashr i32 %a, %b + %ashr.t = trunc i32 %ashr to i8 + ret i8 %ashr.t +} + +define i8 @test_ashr_range_1(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_ashr_range_1' +; CHECK-DAG: DemandedBits: 0xff for %shl.t = trunc i32 %ashr to i8 +; CHECK-DAG: DemandedBits: 0xff for %ashr in %shl.t = trunc i32 %ashr to i8 +; CHECK-DAG: 
DemandedBits: 0xffffffff for %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0x7ff for %a in %ashr = ashr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %ashr = ashr i32 %a, %b2 +; + %b2 = and i32 %b, 3 + %ashr = ashr i32 %a, %b2 + %shl.t = trunc i32 %ashr to i8 + ret i8 %shl.t +} + +define i32 @test_ashr_range_2(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_ashr_range_2' +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xffffffff for %ashr = ashr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %ashr = ashr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %ashr = ashr i32 %a, %b2 +; + %b2 = and i32 %b, 3 + %ashr = ashr i32 %a, %b2 + ret i32 %ashr +} + +define i32 @test_ashr_range_3(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_ashr_range_3' +; CHECK-DAG: DemandedBits: 0xffff for %ashr = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %ashr = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %ashr = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %shl = shl i32 %ashr, 16 +; CHECK-DAG: DemandedBits: 0xffff for %ashr in %shl = shl i32 %ashr, 16 +; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %shl = shl i32 %ashr, 16 +; + %ashr = ashr i32 %a, %b + %shl = shl i32 %ashr, 16 + ret i32 %shl +} +define i32 @test_ashr_range_4(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_ashr_range_4' +; CHECK-DAG: DemandedBits: 0xffffffff for %shr = lshr i32 %ashr, 8 +; CHECK-DAG: DemandedBits: 0xffffff00 for %ashr in %shr = lshr i32 %ashr, 8 +; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %shr = lshr i32 %ashr, 8 +; CHECK-DAG: DemandedBits: 0xffffff00 for %ashr = 
ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffff00 for %a in %ashr = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %ashr = ashr i32 %a, %b + %ashr = ashr i32 %a, %b + %shr = lshr i32 %ashr, 8 + ret i32 %shr +} + +define i32 @test_ashr_range_5(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_ashr_range_5' +; CHECK-DAG: DemandedBits: 0xffffffff for %2 = and i32 %1, 255 +; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = and i32 %1, 255 +; CHECK-DAG: DemandedBits: 0xffffffff for 255 in %2 = and i32 %1, 255 +; CHECK-DAG: DemandedBits: 0xff for %1 = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = ashr i32 %a, %b +; + %1 = ashr i32 %a, %b + %2 = and i32 %1, 255 + ret i32 %2 +} + +define i32 @test_ashr_range_6(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_ashr_range_6' +; CHECK-DAG: DemandedBits: 0xffff0000 for %lshr.1 = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffff0000 for %a in %lshr.1 = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr.1 = ashr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %lshr.2 = ashr i32 %lshr.1, 16 +; CHECK-DAG: DemandedBits: 0xffff0000 for %lshr.1 in %lshr.2 = ashr i32 %lshr.1, 16 +; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %lshr.2 = ashr i32 %lshr.1, 16 +; + %lshr.1 = ashr i32 %a, %b + %lshr.2 = ashr i32 %lshr.1, 16 + ret i32 %lshr.2 +} + +define i8 @test_ashr_var_amount(i32 %a, i32 %b){ +; CHECK-LABEL: 'test_ashr_var_amount' +; CHECK-DAG: DemandedBits: 0xff for %4 = ashr i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = ashr i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = ashr i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8 +; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8 +; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b +; CHECK-DAG: DemandedBits: 
0xffffffff for %b in %1 = add nsw i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32 +; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32 +; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8 +; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8 +; + %1 = add nsw i32 %a, %b + %2 = trunc i32 %1 to i8 + %3 = zext i8 %2 to i32 + %4 = ashr i32 %1, %3 + %5 = trunc i32 %4 to i8 + ret i8 %5 +} + +define i8 @test_ashr_var_amount_nsw(i32 %a, i32 %b){ + ; CHECK-LABEL 'test_ashr_var_amount_nsw' + ; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8 + ; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8 + ; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b + ; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b + ; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b + ; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8 + ; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8 + ; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32 + ; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32 + ; CHECK-DAG: DemandedBits: 0xff for %4 = ashr exact i32 %1, %3 + ; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = ashr exact i32 %1, %3 + ; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = ashr exact i32 %1, %3 + ; + %1 = add nsw i32 %a, %b + %2 = trunc i32 %1 to i8 + %3 = zext i8 %2 to i32 + %4 = ashr exact i32 %1, %3 + %5 = trunc i32 %4 to i8 + ret i8 %5 +} diff --git a/llvm/test/Analysis/DemandedBits/lshr.ll b/llvm/test/Analysis/DemandedBits/lshr.ll new file mode 100644 index 000000000000..e07f994a1b30 --- /dev/null +++ b/llvm/test/Analysis/DemandedBits/lshr.ll @@ -0,0 +1,198 @@ +; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s + +define i8 @test_lshr_const_amount_4(i32 %a) { +; CHECK-LABEL: 'test_lshr_const_amount_4' +; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8 +; 
CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, 4 +; CHECK-DAG: DemandedBits: 0xff0 for %a in %lshr = lshr i32 %a, 4 +; CHECK-DAG: DemandedBits: 0xffffffff for 4 in %lshr = lshr i32 %a, 4 +; + %lshr = lshr i32 %a, 4 + %lshr.t = trunc i32 %lshr to i8 + ret i8 %lshr.t +} + +define i8 @test_lshr_const_amount_5(i32 %a) { +; CHECK-LABEL: 'test_lshr_const_amount_5' +; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, 5 +; CHECK-DAG: DemandedBits: 0x1fe0 for %a in %lshr = lshr i32 %a, 5 +; CHECK-DAG: DemandedBits: 0xffffffff for 5 in %lshr = lshr i32 %a, 5 +; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8 +; + %lshr = lshr i32 %a, 5 + %lshr.t = trunc i32 %lshr to i8 + ret i8 %lshr.t +} +define i8 @test_lshr_const_amount_8(i32 %a) { +; CHECK-LABEL: 'test_lshr_const_amount_8' +; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, 8 +; CHECK-DAG: DemandedBits: 0xff00 for %a in %lshr = lshr i32 %a, 8 +; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %lshr = lshr i32 %a, 8 +; + %lshr = lshr i32 %a, 8 + %lshr.t = trunc i32 %lshr to i8 + ret i8 %lshr.t +} + +define i8 @test_lshr_const_amount_9(i32 %a) { +; CHECK-LABEL: 'test_lshr_const_amount_9' +; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, 9 +; CHECK-DAG: DemandedBits: 0x1fe00 for %a in %lshr = lshr i32 %a, 9 +; CHECK-DAG: DemandedBits: 0xffffffff for 9 in %lshr = lshr i32 %a, 9 +; + %lshr = lshr i32 %a, 9 + %lshr.t = trunc i32 %lshr to i8 + ret i8 %lshr.t +} + +define i8 @test_lshr(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_lshr' +; CHECK-DAG: 
DemandedBits: 0xff for %lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8 +; + %lshr = lshr i32 %a, %b + %lshr.t = trunc i32 %lshr to i8 + ret i8 %lshr.t +} + +define i8 @test_lshr_range_1(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_lshr_range_1' +; CHECK-DAG: DemandedBits: 0xff for %shl.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr in %shl.t = trunc i32 %lshr to i8 +; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0x7ff for %a in %lshr = lshr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %lshr = lshr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3 +; + %b2 = and i32 %b, 3 + %lshr = lshr i32 %a, %b2 + %shl.t = trunc i32 %lshr to i8 + ret i8 %shl.t +} + +define i32 @test_lshr_range_2(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_lshr_range_2' +; CHECK-DAG: DemandedBits: 0xffffffff for %lshr = lshr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %lshr = lshr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %lshr = lshr i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3 +; + %b2 = and i32 %b, 3 + %lshr = lshr i32 %a, %b2 + ret i32 %lshr +} + +define i32 @test_lshr_range_3(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_lshr_range_3' +; CHECK-DAG: DemandedBits: 0xffff for %lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in 
%lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %shl = shl i32 %lshr, 16 +; CHECK-DAG: DemandedBits: 0xffff for %lshr in %shl = shl i32 %lshr, 16 +; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %shl = shl i32 %lshr, 16 +; + %lshr = lshr i32 %a, %b + %shl = shl i32 %lshr, 16 + ret i32 %shl +} + +define i32 @test_lshr_range_4(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_lshr_range_4' +; CHECK-DAG: DemandedBits: 0xffffff00 for %lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffff00 for %a in %lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %shr = ashr i32 %lshr, 8 +; CHECK-DAG: DemandedBits: 0xffffff00 for %lshr in %shr = ashr i32 %lshr, 8 +; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %shr = ashr i32 %lshr, 8 + %lshr = lshr i32 %a, %b + %shr = ashr i32 %lshr, 8 + ret i32 %shr +} + +define i32 @test_lshr_range_5(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_lshr_range_5' +; CHECK-DAG: DemandedBits: 0xff for %1 = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %2 = and i32 %1, 255 +; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = and i32 %1, 255 +; CHECK-DAG: DemandedBits: 0xffffffff for 255 in %2 = and i32 %1, 255 +; + %1 = lshr i32 %a, %b + %2 = and i32 %1, 255 + ret i32 %2 +} + +define i32 @test_lshr_range_6(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_lshr_range_6' +; CHECK-DAG: DemandedBits: 0xffff0000 for %lshr.1 = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffff0000 for %a in %lshr.1 = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr.1 = lshr i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %lshr.2 = lshr i32 %lshr.1, 16 +; CHECK-DAG: DemandedBits: 0xffff0000 for %lshr.1 in %lshr.2 = lshr i32 %lshr.1, 16 +; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %lshr.2 = lshr i32 %lshr.1, 16 +; 
+ %lshr.1 = lshr i32 %a, %b + %lshr.2 = lshr i32 %lshr.1, 16 + ret i32 %lshr.2 +} + + +define i8 @test_lshr_var_amount(i32 %a, i32 %b){ +; CHECK-LABEL: 'test_lshr_var_amount' +; CHECK-DAG: DemandedBits: 0xff for %4 = lshr i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = lshr i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = lshr i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8 +; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8 +; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32 +; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32 +; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b +; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8 +; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8 +; + %1 = add nsw i32 %a, %b + %2 = trunc i32 %1 to i8 + %3 = zext i8 %2 to i32 + %4 = lshr i32 %1, %3 + %5 = trunc i32 %4 to i8 + ret i8 %5 +} + +define i8 @test_lshr_var_amount_exact(i32 %a, i32 %b){ + ; CHECK-LABEL 'test_lshr_var_amount_nsw' + ; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b + ; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b + ; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b + ; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8 + ; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8 + ; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32 + ; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32 + ; CHECK-DAG: DemandedBits: 0xff for %4 = lshr exact i32 %1, %3 + ; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = lshr exact i32 %1, %3 + ; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = lshr exact i32 %1, %3 + ; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8 + ; CHECK-DAG: 
DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8 + ; + %1 = add nsw i32 %a, %b + %2 = trunc i32 %1 to i8 + %3 = zext i8 %2 to i32 + %4 = lshr exact i32 %1, %3 + %5 = trunc i32 %4 to i8 + ret i8 %5 +} diff --git a/llvm/test/Analysis/DemandedBits/shl.ll b/llvm/test/Analysis/DemandedBits/shl.ll index e41f5f410773..c872d2d854e8 100644 --- a/llvm/test/Analysis/DemandedBits/shl.ll +++ b/llvm/test/Analysis/DemandedBits/shl.ll @@ -57,10 +57,142 @@ define i8 @test_shl(i32 %a, i32 %b) { ; CHECK-DAG: DemandedBits: 0xff for %shl.t = trunc i32 %shl to i8 ; CHECK-DAG: DemandedBits: 0xff for %shl in %shl.t = trunc i32 %shl to i8 ; CHECK-DAG: DemandedBits: 0xff for %shl = shl i32 %a, %b -; CHECK-DAG: DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xff for %a in %shl = shl i32 %a, %b ; CHECK-DAG: DemandedBits: 0xffffffff for %b in %shl = shl i32 %a, %b ; %shl = shl i32 %a, %b %shl.t = trunc i32 %shl to i8 ret i8 %shl.t } + +define i8 @test_shl_range_1(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_shl_range_1' +; CHECK-DAG: DemandedBits: 0xff for %shl = shl i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xff for %a in %shl = shl i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %shl = shl i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xff for %shl.t = trunc i32 %shl to i8 +; CHECK-DAG: DemandedBits: 0xff for %shl in %shl.t = trunc i32 %shl to i8 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3 +; + %b2 = and i32 %b, 3 + %shl = shl i32 %a, %b2 + %shl.t = trunc i32 %shl to i8 + ret i8 %shl.t +} + +define i32 @test_shl_range_2(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_shl_range_2' +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3 +; CHECK-DAG: DemandedBits: 0xffffffff for 
%shl = shl i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b2 +; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %shl = shl i32 %a, %b2 +; + %b2 = and i32 %b, 3 + %shl = shl i32 %a, %b2 + ret i32 %shl +} + +define i32 @test_shl_range_3(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_shl_range_3' +; CHECK-DAG: DemandedBits: 0xffffffff for %shr = lshr i32 %shl, 16 +; CHECK-DAG: DemandedBits: 0xffff0000 for %shl in %shr = lshr i32 %shl, 16 +; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %shr = lshr i32 %shl, 16 +; CHECK-DAG: DemandedBits: 0xffff0000 for %shl = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %shl = shl i32 %a, %b +; + %shl = shl i32 %a, %b + %shr = lshr i32 %shl, 16 + ret i32 %shr +} + +define i32 @test_shl_range_4(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_shl_range_4' +; CHECK-DAG: DemandedBits: 0xffffffff for %shr = ashr i32 %shl, 8 +; CHECK-DAG: DemandedBits: 0xffffff00 for %shl in %shr = ashr i32 %shl, 8 +; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %shr = ashr i32 %shl, 8 +; CHECK-DAG: DemandedBits: 0xffffff00 for %shl = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %shl = shl i32 %a, %b + %shl = shl i32 %a, %b + %shr = ashr i32 %shl, 8 + ret i32 %shr +} + +define i32 @test_shl_range_5(i32 %a, i32 %b) { +; CHECK-LABEL: 'test_shl_range_5' +; CHECK-DAG: DemandedBits: 0xff for %1 = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xff for %a in %1 = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %2 = and i32 %1, 255 +; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = and i32 %1, 255 +; CHECK-DAG: DemandedBits: 0xffffffff for 255 in %2 = and i32 %1, 255 +; + %1 = shl i32 %a, %b + %2 = and i32 %1, 255 + ret i32 %2 +} + +define i32 @test_shl_range_6(i32 %a, i32 %b) { +; CHECK-LABEL: 
'test_shl_range_6' +; CHECK-DAG: DemandedBits: 0xffffffff for %shl.2 = shl i32 %shl.1, 16 +; CHECK-DAG: DemandedBits: 0xffff for %shl.1 in %shl.2 = shl i32 %shl.1, 16 +; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %shl.2 = shl i32 %shl.1, 16 +; CHECK-DAG: DemandedBits: 0xffff for %shl.1 = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffff for %a in %shl.1 = shl i32 %a, %b +; CHECK-DAG: DemandedBits: 0xffffffff for %b in %shl.1 = shl i32 %a, %b +; + %shl.1 = shl i32 %a, %b + %shl.2 = shl i32 %shl.1, 16 + ret i32 %shl.2 +} + +define i8 @test_shl_var_amount(i32 %a, i32 %b){ +; CHECK-LABEL: 'test_shl_var_amount' +; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8 +; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8 +; CHECK-DAG: DemandedBits: 0xff for %4 = shl i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xff for %1 in %4 = shl i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = shl i32 %1, %3 +; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8 +; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8 +; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32 +; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32 +; CHECK-DAG: DemandedBits: 0xff for %1 = add nsw i32 %a, %b +; CHECK-DAG: DemandedBits: 0xff for %a in %1 = add nsw i32 %a, %b +; CHECK-DAG: DemandedBits: 0xff for %b in %1 = add nsw i32 %a, %b +; + %1 = add nsw i32 %a, %b + %2 = trunc i32 %1 to i8 + %3 = zext i8 %2 to i32 + %4 = shl i32 %1, %3 + %5 = trunc i32 %4 to i8 + ret i8 %5 +} + +define i8 @test_shl_var_amount_nsw(i32 %a, i32 %b){ + ; CHECK-LABEL 'test_shl_var_amount_nsw' + ; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8 + ; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8 + ; CHECK-DAG: DemandedBits: 0xff for %4 = shl nsw i32 %1, %3 + ; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = shl nsw i32 %1, %3 + ; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = shl nsw i32 %1, %3 + ; CHECK-DAG: DemandedBits: 
0xffffffff for %3 = zext i8 %2 to i32 + ; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32 + ; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8 + ; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8 + ; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b + ; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b + ; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b + ; + %1 = add nsw i32 %a, %b + %2 = trunc i32 %1 to i8 + %3 = zext i8 %2 to i32 + %4 = shl nsw i32 %1, %3 + %5 = trunc i32 %4 to i8 + ret i8 %5 +} From 6960bf556c3eb7e3fcd5da3de28f55310bea341e Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 18 Aug 2025 10:20:31 -0700 Subject: [PATCH 07/27] [Github] Drop llvm-project-tests All users of this have been cleaned up so we can now drop it fully. Reviewers: cmtice, tstellar Reviewed By: cmtice Pull Request: https://github.com/llvm/llvm-project/pull/153877 --- .github/workflows/llvm-project-tests.yml | 149 ------------------ .../workflows/llvm-project-workflow-tests.yml | 32 ---- 2 files changed, 181 deletions(-) delete mode 100644 .github/workflows/llvm-project-tests.yml delete mode 100644 .github/workflows/llvm-project-workflow-tests.yml diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml deleted file mode 100644 index 8621a3b59218..000000000000 --- a/.github/workflows/llvm-project-tests.yml +++ /dev/null @@ -1,149 +0,0 @@ -name: LLVM Project Tests - -permissions: - contents: read - -on: - workflow_dispatch: - inputs: - build_target: - required: false - projects: - required: false - extra_cmake_args: - required: false - os_list: - required: false - default: '["ubuntu-24.04", "windows-2019", "macOS-13"]' - python_version: - required: false - type: string - default: '3.11' - workflow_call: - inputs: - build_target: - required: false - type: string - default: "all" - - projects: - required: true - type: string - - 
extra_cmake_args: - required: false - type: string - - os_list: - required: false - type: string - # Use windows-2019 due to: - # https://developercommunity.visualstudio.com/t/Prev-Issue---with-__assume-isnan-/1597317 - default: '["ubuntu-24.04", "windows-2019", "macOS-13"]' - - python_version: - required: false - type: string - default: '3.11' - -concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. - # If the group name here is the same as the group name in the workflow that includes - # this one, then the action will try to wait on itself and get stuck. - group: llvm-project-${{ github.workflow }}-${{ inputs.projects }}-${{ inputs.python_version }}${{ github.ref }} - cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} - -jobs: - lit-tests: - name: Lit Tests - runs-on: ${{ matrix.os }} - container: - image: ${{(startsWith(matrix.os, 'ubuntu') && 'ghcr.io/llvm/ci-ubuntu-24.04:latest') || null}} - volumes: - - /mnt/:/mnt/ - strategy: - fail-fast: false - matrix: - os: ${{ fromJSON(inputs.os_list) }} - steps: - - name: Setup Windows - if: startsWith(matrix.os, 'windows') - uses: llvm/actions/setup-windows@main - with: - arch: amd64 - # On Windows, starting with win19/20220814.1, cmake choose the 32-bit - # python3.10.6 libraries instead of the 64-bit libraries when building - # lldb. Using this setup-python action to make 3.10 the default - # python fixes this. - - name: Setup Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 - with: - python-version: ${{ inputs.python_version }} - - name: Install Ninja - if: runner.os != 'Linux' - uses: llvm/actions/install-ninja@main - # actions/checkout deletes any existing files in the new git directory, - # so this needs to either run before ccache-action or it has to use - # clean: false. 
- - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - with: - fetch-depth: 250 - - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 - with: - # A full build of llvm, clang, lld, and lldb takes about 250MB - # of ccache space. There's not much reason to have more than this, - # because we usually won't need to save cache entries from older - # builds. Also, there is an overall 10GB cache limit, and each - # run creates a new cache entry so we want to ensure that we have - # enough cache space for all the tests to run at once and still - # fit under the 10 GB limit. - # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174 - max-size: 2G - key: ${{ matrix.os }} - variant: sccache - - name: Build and Test - env: - # Workaround for https://github.com/actions/virtual-environments/issues/5900. - # This should be a no-op for non-mac OSes - PKG_CONFIG_PATH: /usr/local/Homebrew/Library/Homebrew/os/mac/pkgconfig//12 - shell: bash - id: build-llvm - run: | - if [ "${{ runner.os }}" == "Linux" ]; then - builddir="/mnt/build/" - sudo mkdir -p $builddir - sudo chown gha $builddir - extra_cmake_args="-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang" - else - builddir="$(pwd)"/build - fi - if [ "${{ runner.os }}" == "macOS" ]; then - # Workaround test failure on some lld tests on MacOS - # https://github.com/llvm/llvm-project/issues/81967 - extra_cmake_args="-DLLVM_DISABLE_ASSEMBLY_FILES=ON" - fi - echo "llvm-builddir=$builddir" >> "$GITHUB_OUTPUT" - cmake -G Ninja \ - -B "$builddir" \ - -S llvm \ - -DLLVM_ENABLE_PROJECTS="${{ inputs.projects }}" \ - -DCMAKE_BUILD_TYPE=Release \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - -DLLDB_INCLUDE_TESTS=OFF \ - -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl" \ - -DCMAKE_C_COMPILER_LAUNCHER=sccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ - $extra_cmake_args \ - ${{ 
inputs.extra_cmake_args }} - ninja -C "$builddir" '${{ inputs.build_target }}' - - - name: Build and Test libclc - if: "!startsWith(matrix.os, 'windows') && contains(inputs.projects, 'libclc')" - env: - LLVM_BUILDDIR: ${{ steps.build-llvm.outputs.llvm-builddir }} - run: | - # The libclc tests don't have a generated check target so all we can - # do is build it. - ninja -C "$LLVM_BUILDDIR" diff --git a/.github/workflows/llvm-project-workflow-tests.yml b/.github/workflows/llvm-project-workflow-tests.yml deleted file mode 100644 index a2539b279be0..000000000000 --- a/.github/workflows/llvm-project-workflow-tests.yml +++ /dev/null @@ -1,32 +0,0 @@ -# This workflow will test the llvm-project-tests workflow in PRs -# targetting the main branch. Since this workflow doesn't normally -# run on main PRs, we need some way to test it to ensure new updates -# don't break it. - -name: LLVM Workflow Test - -permissions: - contents: read - -on: - pull_request: - branches: - - 'main' - paths: - - '.github/workflows/llvm-project-tests.yml' - - '.github/workflows/llvm-project-workflow-tests.yml' - -concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} - -jobs: - llvm-test: - if: github.repository_owner == 'llvm' - name: Build and Test - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-all - projects: clang;lld;libclc;lldb From 99829573cc8460782e4f10713ef24d5af9f82036 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Mon, 18 Aug 2025 10:27:37 -0700 Subject: [PATCH 08/27] [Clang][Webassembly] Remove unreachable code in ParseTypeQualifierListOpt (#153729) Static analysis flagged this goto as unreachable and indeed it is, so removing it. 
--- clang/lib/Parse/ParseDecl.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index fd53cca5a13f..96f1a53922d1 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -6224,7 +6224,6 @@ void Parser::ParseTypeQualifierListOpt( case tok::kw___funcref: ParseWebAssemblyFuncrefTypeAttribute(DS.getAttributes()); continue; - goto DoneWithTypeQuals; case tok::kw___pascal: if (AttrReqs & AR_VendorAttributesParsed) { From 7f27482a32180def47c71f490501ea0e560bfa9f Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Mon, 18 Aug 2025 13:32:54 -0400 Subject: [PATCH 09/27] [AMDGPU][LowerBufferFatPointers] Fix lack of rewrite when loading/storing null (#154128) Fixes #154056. The fat buffer lowering pass was erroneously detecting that it did not need to run on functions that only load/store to the null constant (or other such constants). We thought this would be covered by specializing constants out to instructions, but that doesn't account for trivial constants like null. Therefore, we check the operands of instructions for buffer fat pointers in order to find such constants and ensure the pass runs. 
--------- Co-authored-by: Nikita Popov --- .../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 6 ++- .../lower-buffer-fat-pointers-constants.ll | 40 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index ed73dc890390..139cad60ebcb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2366,8 +2366,12 @@ static bool containsBufferFatPointers(const Function &F, BufferFatPtrToStructTypeMap *TypeMap) { bool HasFatPointers = false; for (const BasicBlock &BB : F) - for (const Instruction &I : BB) + for (const Instruction &I : BB) { HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType())); + // Catch null pointer constants in loads, stores, etc. + for (const Value *V : I.operand_values()) + HasFatPointers |= (V->getType() != TypeMap->remapType(V->getType())); + } return HasFatPointers; } diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll index a0c1e573f8fb..a09e392b89e6 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll @@ -223,3 +223,43 @@ define i32 @fancy_zero() { ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)) to i32) } + +define i32 @load_null() { +; CHECK-LABEL: define i32 @load_null +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 null, i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i32 [[X]] +; + %x = load i32, ptr addrspace(7) null, align 4 + ret i32 %x +} + +define void @store_null() { +; CHECK-LABEL: define void @store_null +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 0, ptr addrspace(8) align 4 
null, i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + store i32 0, ptr addrspace(7) null, align 4 + ret void +} + +define i32 @load_poison() { +; CHECK-LABEL: define i32 @load_poison +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 poison, i32 poison, i32 0, i32 0) +; CHECK-NEXT: ret i32 [[X]] +; + %x = load i32, ptr addrspace(7) poison, align 4 + ret i32 %x +} + +define void @store_poison() { +; CHECK-LABEL: define void @store_poison +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 0, ptr addrspace(8) align 4 poison, i32 poison, i32 0, i32 0) +; CHECK-NEXT: ret void +; + store i32 0, ptr addrspace(7) poison, align 4 + ret void +} From 350f4a3e3b0ebd9695f9c2194db5fd86ff551489 Mon Sep 17 00:00:00 2001 From: LauraElanorJones Date: Mon, 18 Aug 2025 10:47:14 -0700 Subject: [PATCH 10/27] Decent to Descent (#154040) [lldb] Rename RecursiveDecentFormatter to RecursiveDescentFormatter (NFC) --- lldb/packages/Python/lldbsuite/test/lldbutil.py | 7 +++---- lldb/test/API/python_api/value/TestValueAPI.py | 2 +- lldb/utils/lui/lldbutil.py | 7 +++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/lldbutil.py b/lldb/packages/Python/lldbsuite/test/lldbutil.py index 8112705438c1..b8a78b71f5ec 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbutil.py +++ b/lldb/packages/Python/lldbsuite/test/lldbutil.py @@ -1464,8 +1464,8 @@ class ChildVisitingFormatter(BasicFormatter): return output.getvalue() -class RecursiveDecentFormatter(BasicFormatter): - """The recursive decent formatter prints the value and the decendents. +class RecursiveDescentFormatter(BasicFormatter): + """The recursive descent formatter prints the value and the descendents. The constructor takes two keyword args: indent_level, which defaults to 0, and indent_child, which defaults to 2. 
The current indentation level is @@ -1482,7 +1482,6 @@ class RecursiveDecentFormatter(BasicFormatter): output = io.StringIO() else: output = buffer - BasicFormatter.format(self, value, buffer=output, indent=self.lindent) new_indent = self.lindent + self.cindent for child in value: @@ -1490,7 +1489,7 @@ class RecursiveDecentFormatter(BasicFormatter): BasicFormatter.format(self, child, buffer=output, indent=new_indent) else: if child.GetNumChildren() > 0: - rdf = RecursiveDecentFormatter(indent_level=new_indent) + rdf = RecursiveDescentFormatter(indent_level=new_indent) rdf.format(child, buffer=output) else: BasicFormatter.format(self, child, buffer=output, indent=new_indent) diff --git a/lldb/test/API/python_api/value/TestValueAPI.py b/lldb/test/API/python_api/value/TestValueAPI.py index 0da57346212d..907992bf05c0 100644 --- a/lldb/test/API/python_api/value/TestValueAPI.py +++ b/lldb/test/API/python_api/value/TestValueAPI.py @@ -83,7 +83,7 @@ class ValueAPITestCase(TestBase): fmt = lldbutil.BasicFormatter() cvf = lldbutil.ChildVisitingFormatter(indent_child=2) - rdf = lldbutil.RecursiveDecentFormatter(indent_child=2) + rdf = lldbutil.RecursiveDescentFormatter(indent_child=2) if self.TraceOn(): print(fmt.format(days_of_week)) print(cvf.format(days_of_week)) diff --git a/lldb/utils/lui/lldbutil.py b/lldb/utils/lui/lldbutil.py index 6cbf4a302f65..140317af3670 100644 --- a/lldb/utils/lui/lldbutil.py +++ b/lldb/utils/lui/lldbutil.py @@ -1040,8 +1040,8 @@ class ChildVisitingFormatter(BasicFormatter): return output.getvalue() -class RecursiveDecentFormatter(BasicFormatter): - """The recursive decent formatter prints the value and the decendents. +class RecursiveDescentFormatter(BasicFormatter): + """The recursive descent formatter prints the value and the descendents. The constructor takes two keyword args: indent_level, which defaults to 0, and indent_child, which defaults to 2. 
The current indentation level is @@ -1058,7 +1058,6 @@ class RecursiveDecentFormatter(BasicFormatter): output = io.StringIO() else: output = buffer - BasicFormatter.format(self, value, buffer=output, indent=self.lindent) new_indent = self.lindent + self.cindent for child in value: @@ -1066,7 +1065,7 @@ class RecursiveDecentFormatter(BasicFormatter): BasicFormatter.format(self, child, buffer=output, indent=new_indent) else: if child.GetNumChildren() > 0: - rdf = RecursiveDecentFormatter(indent_level=new_indent) + rdf = RecursiveDescentFormatter(indent_level=new_indent) rdf.format(child, buffer=output) else: BasicFormatter.format(self, child, buffer=output, indent=new_indent) From 58de8f2c25291549dc1cabe364d399e564bca042 Mon Sep 17 00:00:00 2001 From: Justin Fargnoli Date: Mon, 18 Aug 2025 10:48:49 -0700 Subject: [PATCH 11/27] [Inliner] Add option (default off) to inline all calls regardless of the cost (#152365) Add a default off option to the inline cost calculation to always inline all viable calls regardless of the cost/benefit and cost/threshold calculations. For performance reasons, some users require that all calls be inlined. Rather than forcing them to adjust the inlining threshold to an arbitrarily high value, offer an option to inline all calls. 
--- llvm/lib/Analysis/InlineCost.cpp | 8 ++ .../Inline/inline-all-viable-calls.ll | 114 ++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 llvm/test/Transforms/Inline/inline-all-viable-calls.ll diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 22f4d08448a2..757f68999691 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -180,6 +180,10 @@ static cl::opt DisableGEPConstOperand( "disable-gep-const-evaluation", cl::Hidden, cl::init(false), cl::desc("Disables evaluation of GetElementPtr with constant operands")); +static cl::opt InlineAllViableCalls( + "inline-all-viable-calls", cl::Hidden, cl::init(false), + cl::desc("Inline all viable calls, even if they exceed the inlining " + "threshold")); namespace llvm { std::optional getStringFnAttrAsInt(const Attribute &Attr) { if (Attr.isValid()) { @@ -3272,6 +3276,10 @@ InlineCost llvm::getInlineCost( return llvm::InlineCost::getNever(UserDecision->getFailureReason()); } + if (InlineAllViableCalls && isInlineViable(*Callee).isSuccess()) + return llvm::InlineCost::getAlways( + "Inlining forced by -inline-all-viable-calls"); + LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "... (caller:" << Call.getCaller()->getName() << ")\n"); diff --git a/llvm/test/Transforms/Inline/inline-all-viable-calls.ll b/llvm/test/Transforms/Inline/inline-all-viable-calls.ll new file mode 100644 index 000000000000..a06ec1acd4ef --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-all-viable-calls.ll @@ -0,0 +1,114 @@ +; RUN: opt -passes=inline -inline-threshold=0 -inline-all-viable-calls -S < %s | FileCheck %s + +; Check that viable calls that are beyond the cost threshold are still inlined. 
+define i32 @callee_simple(i32 %x) { + %1 = add i32 %x, 1 + %2 = mul i32 %1, 2 + %3 = sub i32 %2, 1 + %4 = add i32 %3, 3 + %5 = mul i32 %4, 2 + %6 = sub i32 %5, 2 + %7 = add i32 %6, 1 + ret i32 %7 +} + +; Check that user decisions are respected. +define i32 @callee_alwaysinline(i32 %x) alwaysinline { + %sub = sub i32 %x, 3 + ret i32 %sub +} + +define i32 @callee_noinline(i32 %x) noinline { + %div = sdiv i32 %x, 2 + ret i32 %div +} + +define i32 @callee_optnone(i32 %x) optnone noinline { + %rem = srem i32 %x, 2 + ret i32 %rem +} + +define i32 @caller(i32 %a) { +; CHECK-LABEL: define i32 @caller( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[A]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP8]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], 2 +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[TMP6]], 1 +; CHECK-NEXT: [[SUB_I:%.*]] = sub i32 [[ADD_I]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_noinline(i32 [[SUB_I]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @callee_optnone(i32 [[TMP1]]) +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %1 = call i32 @callee_simple(i32 %a) + %2 = call i32 @callee_alwaysinline(i32 %1) + %3 = call i32 @callee_noinline(i32 %2) + %4 = call i32 @callee_optnone(i32 %3) + %sum = add i32 %4, %3 + ret i32 %sum +} + +; Check that non-viable calls are not inlined + +; Test recursive function is not inlined +define i32 @recursive(i32 %n) { +entry: + %cmp = icmp eq i32 %n, 0 + br i1 %cmp, label %base, label %recurse + +base: + ret i32 0 + +recurse: + %dec = sub i32 %n, 1 + %rec = call i32 @recursive(i32 %dec) + %add = add i32 %rec, 1 + ret i32 %add +} + +define i32 @call_recursive(i32 %x) { +; CHECK-LABEL: define i32 @call_recursive( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i32 @recursive(i32 [[X]]) +; 
CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @recursive(i32 %x) + ret i32 %r +} + +; Test indirectbr prevents inlining +define void @has_indirectbr(ptr %ptr, i32 %cond) { +entry: + switch i32 %cond, label %default [ + i32 0, label %target0 + i32 1, label %target1 + ] + +target0: + br label %end + +target1: + br label %end + +default: + br label %end + +end: + indirectbr ptr %ptr, [label %target0, label %target1] + ret void +} + +define void @call_indirectbr(ptr %p, i32 %c) { +; CHECK-LABEL: define void @call_indirectbr( +; CHECK-SAME: ptr [[P:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: call void @has_indirectbr(ptr [[P]], i32 [[C]]) +; CHECK-NEXT: ret void +; + call void @has_indirectbr(ptr %p, i32 %c) + ret void +} + From 7e8ff2afa9ddfe1d7c42bb58cc9523006c34396b Mon Sep 17 00:00:00 2001 From: Shaoce SUN Date: Tue, 19 Aug 2025 01:52:24 +0800 Subject: [PATCH 12/27] [RISCV][GISel] Optimize +0.0 to use fcvt.d.w for s64 on rv32 (#153978) Resolve the TODO: on RV32, when constructing the double-precision constant `+0.0` for `s64`, `BuildPairF64Pseudo` can be optimized to use the `fcvt.d.w` instruction to generate the result directly. --- .../RISCV/GISel/RISCVInstructionSelector.cpp | 15 +++++- .../CodeGen/RISCV/GlobalISel/double-arith.ll | 48 ++++--------------- .../instruction-select/fp-constant.mir | 6 +-- 3 files changed, 24 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index f83c2b6da892..51ea3fc5f677 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -736,7 +736,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { } case TargetOpcode::G_FCONSTANT: { // TODO: Use constant pool for complex constants. - // TODO: Optimize +0.0 to use fcvt.d.w for s64 on rv32. 
Register DstReg = MI.getOperand(0).getReg(); const APFloat &FPimm = MI.getOperand(1).getFPImm()->getValueAPF(); APInt Imm = FPimm.bitcastToAPInt(); @@ -753,8 +752,22 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { if (!FMV.constrainAllUses(TII, TRI, RBI)) return false; } else { + // s64 on rv32 assert(Size == 64 && !Subtarget->is64Bit() && "Unexpected size or subtarget"); + + if (Imm.isNonNegative() && Imm.isZero()) { + // Optimize +0.0 to use fcvt.d.w + MachineInstrBuilder FCVT = + MIB.buildInstr(RISCV::FCVT_D_W, {DstReg}, {Register(RISCV::X0)}) + .addImm(RISCVFPRndMode::RNE); + if (!FCVT.constrainAllUses(TII, TRI, RBI)) + return false; + + MI.eraseFromParent(); + return true; + } + // Split into two pieces and build through the stack. Register GPRRegHigh = MRI->createVirtualRegister(&RISCV::GPRRegClass); Register GPRRegLow = MRI->createVirtualRegister(&RISCV::GPRRegClass); diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll index cb2037f5fb02..4eb7646d13a3 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll @@ -395,13 +395,9 @@ define double @fmadd_d(double %a, double %b, double %c) nounwind { define double @fmsub_d(double %a, double %b, double %c) nounwind { ; RV32IFD-LABEL: fmsub_d: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: addi sp, sp, -16 -; RV32IFD-NEXT: sw zero, 8(sp) -; RV32IFD-NEXT: sw zero, 12(sp) -; RV32IFD-NEXT: fld fa5, 8(sp) +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: fadd.d fa5, fa2, fa5 ; RV32IFD-NEXT: fmsub.d fa0, fa0, fa1, fa5 -; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fmsub_d: @@ -478,14 +474,10 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind { define double @fnmadd_d(double %a, double %b, double %c) nounwind { ; RV32IFD-LABEL: fnmadd_d: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: addi sp, sp, -16 -; RV32IFD-NEXT: sw zero, 8(sp) -; RV32IFD-NEXT: sw zero, 
12(sp) -; RV32IFD-NEXT: fld fa5, 8(sp) +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: fadd.d fa4, fa0, fa5 ; RV32IFD-NEXT: fadd.d fa5, fa2, fa5 ; RV32IFD-NEXT: fnmadd.d fa0, fa4, fa1, fa5 -; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fnmadd_d: @@ -590,14 +582,10 @@ define double @fnmadd_d(double %a, double %b, double %c) nounwind { define double @fnmadd_d_2(double %a, double %b, double %c) nounwind { ; RV32IFD-LABEL: fnmadd_d_2: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: addi sp, sp, -16 -; RV32IFD-NEXT: sw zero, 8(sp) -; RV32IFD-NEXT: sw zero, 12(sp) -; RV32IFD-NEXT: fld fa5, 8(sp) +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: fadd.d fa4, fa1, fa5 ; RV32IFD-NEXT: fadd.d fa5, fa2, fa5 ; RV32IFD-NEXT: fnmadd.d fa0, fa4, fa0, fa5 -; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fnmadd_d_2: @@ -772,13 +760,9 @@ define double @fnmadd_nsz(double %a, double %b, double %c) nounwind { define double @fnmsub_d(double %a, double %b, double %c) nounwind { ; RV32IFD-LABEL: fnmsub_d: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: addi sp, sp, -16 -; RV32IFD-NEXT: sw zero, 8(sp) -; RV32IFD-NEXT: sw zero, 12(sp) -; RV32IFD-NEXT: fld fa5, 8(sp) +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: fadd.d fa5, fa0, fa5 ; RV32IFD-NEXT: fnmsub.d fa0, fa5, fa1, fa2 -; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fnmsub_d: @@ -851,13 +835,9 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind { define double @fnmsub_d_2(double %a, double %b, double %c) nounwind { ; RV32IFD-LABEL: fnmsub_d_2: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: addi sp, sp, -16 -; RV32IFD-NEXT: sw zero, 8(sp) -; RV32IFD-NEXT: sw zero, 12(sp) -; RV32IFD-NEXT: fld fa5, 8(sp) +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: fadd.d fa5, fa1, fa5 ; RV32IFD-NEXT: fnmsub.d fa0, fa5, fa0, fa2 -; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fnmsub_d_2: @@ -976,14 +956,10 @@ define double @fmadd_d_contract(double %a, 
double %b, double %c) nounwind { define double @fmsub_d_contract(double %a, double %b, double %c) nounwind { ; RV32IFD-LABEL: fmsub_d_contract: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: addi sp, sp, -16 -; RV32IFD-NEXT: sw zero, 8(sp) -; RV32IFD-NEXT: sw zero, 12(sp) -; RV32IFD-NEXT: fld fa5, 8(sp) +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: fadd.d fa5, fa2, fa5 ; RV32IFD-NEXT: fmul.d fa4, fa0, fa1 ; RV32IFD-NEXT: fsub.d fa0, fa4, fa5 -; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fmsub_d_contract: @@ -1069,17 +1045,13 @@ define double @fmsub_d_contract(double %a, double %b, double %c) nounwind { define double @fnmadd_d_contract(double %a, double %b, double %c) nounwind { ; RV32IFD-LABEL: fnmadd_d_contract: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: addi sp, sp, -16 -; RV32IFD-NEXT: sw zero, 8(sp) -; RV32IFD-NEXT: sw zero, 12(sp) -; RV32IFD-NEXT: fld fa5, 8(sp) +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: fadd.d fa4, fa0, fa5 ; RV32IFD-NEXT: fadd.d fa3, fa1, fa5 ; RV32IFD-NEXT: fadd.d fa5, fa2, fa5 ; RV32IFD-NEXT: fmul.d fa4, fa4, fa3 ; RV32IFD-NEXT: fneg.d fa4, fa4 ; RV32IFD-NEXT: fsub.d fa0, fa4, fa5 -; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fnmadd_d_contract: @@ -1204,14 +1176,10 @@ define double @fnmadd_d_contract(double %a, double %b, double %c) nounwind { define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind { ; RV32IFD-LABEL: fnmsub_d_contract: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: addi sp, sp, -16 -; RV32IFD-NEXT: sw zero, 8(sp) -; RV32IFD-NEXT: sw zero, 12(sp) -; RV32IFD-NEXT: fld fa5, 8(sp) +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: fadd.d fa4, fa0, fa5 ; RV32IFD-NEXT: fadd.d fa5, fa1, fa5 ; RV32IFD-NEXT: fnmsub.d fa0, fa4, fa5, fa2 -; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fnmsub_d_contract: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant.mir 
b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant.mir index e82d4bcec48b..4db80c6c1141 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/fp-constant.mir @@ -164,10 +164,8 @@ body: | ; RV32-LABEL: name: double_positive_zero ; RV32: liveins: $x10 ; RV32-NEXT: {{ $}} - ; RV32-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x0 - ; RV32-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x0 - ; RV32-NEXT: [[BuildPairF64Pseudo:%[0-9]+]]:fpr64 = BuildPairF64Pseudo [[COPY1]], [[COPY]] - ; RV32-NEXT: $f10_d = COPY [[BuildPairF64Pseudo]] + ; RV32-NEXT: [[FCVT_D_W:%[0-9]+]]:fpr64 = FCVT_D_W $x0, 0 + ; RV32-NEXT: $f10_d = COPY [[FCVT_D_W]] ; RV32-NEXT: PseudoRET implicit $f10_d ; ; RV64-LABEL: name: double_positive_zero From d49aab10bd424f67a0df0d70f653f8deeb498a16 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Mon, 18 Aug 2025 14:01:19 -0400 Subject: [PATCH 13/27] =?UTF-8?q?Revert=20"[AMDGPU][True16][CodeGen]=20use?= =?UTF-8?q?=20vgpr16=20for=20zext=20patterns=20(#1538=E2=80=A6=20(#154163)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 7c53c6162bd43d952546a3ef7d019babd5244c29. This patch hit an issue in hip test. 
revert and will reopen later --- llvm/lib/Target/AMDGPU/SIInstructions.td | 22 - llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 2 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 12031 ++++++++-------- .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 1204 +- .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 1336 +- .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 2906 ++-- .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 240 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 5432 +++---- .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 637 +- .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 594 +- .../AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll | 1 - .../atomic_optimizations_global_pointer.ll | 64 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 14 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 50 +- .../buffer-fat-pointer-atomicrmw-fmax.ll | 42 +- .../buffer-fat-pointer-atomicrmw-fmin.ll | 42 +- .../CodeGen/AMDGPU/calling-conventions.ll | 100 +- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 4 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 42 +- .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 5 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 106 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 110 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 110 +- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 106 +- llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 2 +- .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 6 +- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/function-args.ll | 253 +- .../AMDGPU/gfx-callable-argument-types.ll | 228 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 106 +- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 110 +- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 110 +- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 106 +- llvm/test/CodeGen/AMDGPU/idot4u.ll | 43 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 28 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 60 +- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 68 +- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 68 +- 
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 60 +- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 31 +- llvm/test/CodeGen/AMDGPU/mad.u16.ll | 7 +- llvm/test/CodeGen/AMDGPU/preserve-hi16.ll | 54 +- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 6 +- llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll | 126 +- .../test/CodeGen/AMDGPU/vector-reduce-umin.ll | 78 +- 45 files changed, 14147 insertions(+), 12609 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6488fa3dacfb..bd5dfa92a8e4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3056,8 +3056,6 @@ def : GCNPat< } } // AddedComplexity = 1 -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { def : GCNPat< (i32 (DivergentUnaryFrag i16:$src)), (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) @@ -3073,26 +3071,6 @@ def : GCNPat< def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), (COPY VSrc_b16:$src)>; -} - -let True16Predicate = UseRealTrue16Insts in { -def : GCNPat< - (i32 (DivergentUnaryFrag i16:$src)), - (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16) ->; - -def : GCNPat< - (i64 (DivergentUnaryFrag i16:$src)), - (REG_SEQUENCE VReg_64, - (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16), sub0, - (S_MOV_B32 (i32 0)), sub1) ->; - -def : GCNPat< - (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), - (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16) ->; -} def : GCNPat < (i32 (trunc i64:$a)), diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf752936..01854c8560ce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -164,7 +164,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_mul_i16_zeroext: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index d03d6a8940b2..0d5f538215f1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_4: ; %end @@ -6549,266 +6549,307 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 
v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v3, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 
v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: 
v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, 
v39 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 
v39.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -15372,63 +15413,63 @@ define <32 x i32> 
@bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 ; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v68, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -15442,143 +15483,144 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, 
s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v103.h, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v50.l, 8, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -15592,660 +15634,746 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, 
v29.h, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; 
GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 
v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, 
v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 
0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 
v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, 
v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l 
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, 
v28.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -42028,64 +42156,64 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -42113,50 +42241,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -42200,50 +42328,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, 
v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB36_4: ; %end @@ -42251,266 +42379,307 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 
v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 
8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v38, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v24.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 
v33, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -52041,63 +52210,63 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v35, off, s32 offset:332 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; 
GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 
offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -52111,143 +52280,144 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 
v134.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v133.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -52261,660 +52431,746 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 
0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 
0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 
v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v24.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: 
v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, 
v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 
v151.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v5.h, v145.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 
v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 +; 
GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -77682,64 +77938,64 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -77767,50 +78023,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -77879,50 +78135,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB56_4: ; %end @@ -77930,266 +78186,307 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 ; GFX11-TRUE16-NEXT: v_and_b16 
v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, 
v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 +; 
GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v39.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, 
v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -86763,63 +87060,63 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 
offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v80, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -86833,143 +87130,144 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, 
off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v149.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -86983,660 +87281,746 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, 
v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, 
v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, 
v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 
v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l 
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, 
v26.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; 
GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 
v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 
v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, 
v103.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, 
v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, 
v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -111416,64 +111800,64 @@ define <128 x i8> 
@bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -111501,50 +111885,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -111588,50 +111972,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 
24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB72_4: ; %end @@ -111639,266 +112023,307 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 +; GFX11-TRUE16-NEXT: v_or_b16 
v39.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, 
v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v24.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, 
v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -121414,63 +121839,63 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 
offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v101, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -121484,143 +121909,144 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v71, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 ; 
GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v80.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -121634,660 
+122060,746 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, 
v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 
0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; 
GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v19.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, 
v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v27.l, v25.h, v55.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v29.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 
v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 
v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 
v11.h, v115.h, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, 
v99.h, v15.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l 
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, 
v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, 
v33 +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -159577,162 +160089,159 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:92 -; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v76, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0x7 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, 
v58, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:32 +; GFX11-TRUE16-NEXT: s_clause 0x4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:12 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr47_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr179_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -159741,142 +160250,143 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, 
v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v20 
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v8.l +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v108.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v110.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v109.h, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v24.l 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v31.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v95.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v26.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v27.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v100.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.h ; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v18 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v70, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v80.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v70.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v36, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v36, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_lshlrev_b32 v17, 16, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -159889,500 +160399,498 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v48, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v37, v51 :: v_dual_and_b32 v35, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v71, v37, v51 :: v_dual_lshlrev_b32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v82.h -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v71.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v81 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v55 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; 
GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v34, v17 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v85, v33, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v85.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v84.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v33, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v84 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v86.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v19, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v20, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v80 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_lshlrev_b32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v87 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v83.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v21, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v82 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v22 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v34, v37 :: 
v_dual_and_b32 v37, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v21, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v96.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v38, vcc_lo -; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v86.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_lshlrev_b32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v33, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v23, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_f32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v97.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v34, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v99.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v97.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v98 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v85 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v35, v38 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v98.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v25, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v28, 0x40c00000, v28 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v100.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v102.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v26, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v26, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v26, v33, v28, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v26, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v101 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v103.h -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v112, v25, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v100.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v25, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v112 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v37, 0x40c00000, v37 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v29, 16, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v99 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v112.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v113.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v32 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v114.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25 ; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v26 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v116, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v103.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 8, v26 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v116.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v113.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: 
v_bfi_b32 v30, 0xffff, v33, v115 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v25 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v29 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v31, 16, v31 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v34.l, v115.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v31, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v117.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v118, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v119, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v119.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v116, v33, v37 :: v_dual_and_b32 v35, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v116.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v32, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v32, v33, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v118 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v32, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v128.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v31, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v114 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v132, v31, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v1, 0x7fff -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v34, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v133.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v132 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v131.h -; GFX11-TRUE16-NEXT: v_bfi_b32 
v2, 0xffff, v2, v129 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v135, v33, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v32 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v36 :: v_dual_add_f32 v37, 0x40c00000, v37 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v146.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff -; GFX11-TRUE16-NEXT: 
v_dual_cndmask_b32 v3, v4, v39 :: v_dual_add_f32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v144, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v146.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v164.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v36, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v133.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v135 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v144 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: 
v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v151, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v165.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v35, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v150.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v33, v37, vcc_lo -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v166.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v180, v33, v37 :: v_dual_add_f32 v35, 0x40c00000, v35 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v180.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v33, v8, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v6, v33, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v6, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v151 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v5, v38, vcc_lo 
-; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v36, 16, v10 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v177.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v161 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v179.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v178, v5, v38 :: v_dual_add_f32 v33, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff ; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v178 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v47, v35, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 24, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v43, v35, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8 +; 
GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v47.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v44, v7, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v43.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v41, v7, v37, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v44, v35, v38 :: v_dual_and_b32 v39, 0xffff0000, v9 +; 
GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v41, v35, v38 :: v_dual_lshlrev_b32 v10, 16, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v41.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v44.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v44 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v41 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v59, v38, v50, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_lshlrev_b32 v7, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v61, v38, v50 :: v_dual_add_f32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v51 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) 
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v61.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v59.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v73, v35, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v62, v48, v52, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v57, v48, v52, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v62 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v14, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v57 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11 ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v35, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v39 :: v_dual_cndmask_b32 v92, v37, v38 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 @@ -160390,18 +160898,18 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v89, v37, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v77, v37, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_add3_u32 v14, v49, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v91, v39, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v78, v39, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v78.h ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v15 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v13, 0x7fff @@ -160411,7 +160919,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v108, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v35, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 @@ -160419,366 +160927,405 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v107, v13, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v93, v13, v49, vcc_lo ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v48, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v110, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v95, v39, v51, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v108.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v104.h ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v91.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v92.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v109, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v89.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v77 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v92, v35, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v110.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v107.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v89 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v95.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v93.h ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v39, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v109 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14 +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v92 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] ; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13 ; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v37, v7 ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v34, v33 
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v58, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7 ; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v111.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v129.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v128.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v107.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v106.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v105.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v104.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v95.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v135.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v133.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v93.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v166.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v88.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v78.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v150.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v151.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v76.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v6, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v43.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v74.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v105.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v94.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v91.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v14 +; GFX11-TRUE16-NEXT: v_or_b16 
v4.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v180.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v90.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v47.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v76.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v58.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v75.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v161.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v179.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v72.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v6.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v62.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v6, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v177.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, 
v63.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v178.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v60.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v180.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v6, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v73.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v57.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v6, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v41.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v47.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v44.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v45.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v92.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v40.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v59.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v182.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v89.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v108.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v176.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v6, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v91.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v165.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v6, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v110.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v109.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v6, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v107.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v6, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v94.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v6, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v79.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v90.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.l, v16.h 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v6, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v77.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v6, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v72.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v75.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v6, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v61.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v6, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v56.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v58.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v6, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v46.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v98.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v6, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v183.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v42.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v6, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v181.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v6, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v167.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v179.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v6, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v6, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.l, 
v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v6, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v6, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v114.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v6, v27 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v6, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v117.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v73.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, 
v178.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v59.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v8.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v43.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v89.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v41.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v42.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v61.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v183.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v11.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v104.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v176.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v166.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v167.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v57.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v78.h ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v6, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v12.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v77.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v95.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v18, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v93.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v149.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, v18, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v79.l +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v92.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v18, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v74.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v46.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v63.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v62.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v17.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v45.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v19, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v40.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v182.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v181.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v177.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v22, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v21.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v14.h, v13.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v163.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v26, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v145.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v25, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v135.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v25, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v35.l 
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v28, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v128.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.l, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v13.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 
v13.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v13.h, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v32, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v14 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[19:22], off offset:80 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:112 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:44 -; GFX11-TRUE16-NEXT: 
scratch_load_b32 v94, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x7 -; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:168 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:132 +; GFX11-TRUE16-NEXT: 
scratch_load_b32 v45, off, s32 offset:136 +; GFX11-TRUE16-NEXT: s_clause 0x4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:156 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -184755,69 +185302,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -184828,69 +185375,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v96, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4 @@ -184899,364 +185446,405 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 
0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] 
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, 
v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 
v36, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -207467,69 +208055,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -207540,69 +208128,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 
24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, 
v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4 @@ -207611,364 +208199,405 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, 
v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %end ; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 
v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 
8, v98.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 
0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 21ec3ee1996a..3e96ab1d597d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -4118,19 +4118,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -4144,95 +4144,103 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 
0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8584,19 +8592,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -8610,95 +8618,103 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12666,19 +12682,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 
v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -12692,95 +12708,103 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 
0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, 
v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16358,19 +16382,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -16384,95 +16408,103 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 
0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 
v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, 
v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19779,19 +19811,19 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -19805,95 +19837,103 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 
v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22685,19 +22725,19 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -22711,95 +22751,103 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; 
GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24896,19 +24944,19 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -24922,95 +24970,103 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 
v0.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v9.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 38302a75fe26..f8ffaa456c2b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -6296,31 +6296,32 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v16.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -6332,175 +6333,194 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, 
v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, 
v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 ; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13315,31 +13335,32 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v11.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -13351,175 +13372,194 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, 
v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 
v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 
v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 
v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 
0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19852,31 +19892,32 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 
v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -19888,175 +19929,194 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 
0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 
v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25879,31 +25939,32 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, 
v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -25915,175 +25976,194 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, 
v4.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v7.l, v5.h, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, 
v14.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v10, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 436b1a038b27..0cefbc1c2dee 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -2966,20 +2966,20 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -2995,17 +2995,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow @@ -3029,17 +3029,17 @@ define <40 x i8> 
@bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_4: ; %end @@ -3047,93 +3047,105 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, 
v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: @@ -5026,49 +5038,48 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v12.l, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 @@ -5081,217 +5092,245 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v6, v27, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v2, v25, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, 
v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9912,20 +9951,20 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -9941,17 +9980,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB32_2: ; %Flow @@ -9971,17 +10010,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB32_4: ; %end @@ -9989,93 +10028,105 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, 
v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: @@ -11986,49 +12037,48 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3 @@ -12041,217 +12091,245 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, 
v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: 
v_or_b16 v25.h, v5.l, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 
v18.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, 
v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v7.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, 
v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16280,20 +16358,20 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -16309,17 +16387,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow @@ -16343,17 +16421,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_4: ; %end @@ -16361,93 +16439,105 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 
v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, 
v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; 
GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8: @@ -22389,20 +22479,20 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -22418,17 +22508,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 
8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB60_2: ; %Flow @@ -22452,17 +22542,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB60_4: ; %end @@ -22470,93 +22560,105 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 
8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: 
v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 
8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8: @@ -28757,51 +28859,50 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v18.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3 @@ -28814,216 +28915,245 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: 
v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4 -; 
GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 
0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 
0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 
0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -30778,20 +30908,20 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -30807,17 +30937,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB74_2: ; %Flow @@ -30836,17 +30966,17 @@ define <40 x i8> 
@bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB74_4: ; %end @@ -30854,93 +30984,105 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 
0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8: @@ -32868,51 +33010,50 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, 
v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3 @@ -32925,216 +33066,245 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: 
v_or_b16 v10.h, v6.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 
0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -34904,20 +35074,20 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -34933,17 +35103,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB78_2: ; %Flow @@ -34970,17 +35140,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %end @@ -34988,93 +35158,105 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 8e30ee659a26..48c9b8775a47 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -2257,8 +2257,8 @@ define i32 
@bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -2273,17 +2273,19 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true @@ -2293,14 +2295,16 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -4502,8 +4506,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -4518,17 +4522,19 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true @@ -4538,14 +4544,16 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 
v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -6459,8 +6467,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -6475,17 +6483,19 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true @@ -6495,14 +6505,16 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -8104,8 +8116,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -8120,17 +8132,19 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v0, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true @@ -8140,14 +8154,16 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -9463,8 +9479,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -9479,17 +9495,19 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true @@ -9499,14 +9517,16 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -10173,8 +10193,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) 
{ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -10189,17 +10209,19 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true @@ -10209,14 +10231,16 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 35d135b12396..5aac06a7f3a2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -8768,32 +8768,32 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -8812,26 +8812,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow @@ -8864,26 +8864,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB24_4: ; %end @@ -8891,135 +8891,156 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, 
v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 
v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -12449,15 +12470,15 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -12471,82 +12492,84 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, 
v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v17.l, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l 
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -12558,338 +12581,384 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 
0xff, v28.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v13.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, 
v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v55.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 
v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23519,32 +23588,32 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -23563,26 +23632,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow @@ -23607,26 +23676,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_4: ; %end @@ -23634,135 +23703,156 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 
0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -27323,15 +27413,15 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -27345,82 +27435,84 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v50.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -27432,338 +27524,384 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 
v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, 
v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 
0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, 
v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 
v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, 
v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -37778,32 +37916,32 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -37822,26 +37960,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow @@ -37879,26 +38017,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB68_4: ; %end @@ -37906,135 +38044,156 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, 
v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 
v14.l, v13.h, v14.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, 
v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -41469,15 +41628,15 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -41491,82 +41650,84 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v35, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -41578,338 +41739,384 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: 
v_and_b16 v3.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, 
v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, 
v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v1, v5, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v5.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, 
v23.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v55.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -51088,32 +51295,32 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -51132,26 +51339,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow @@ -51176,26 +51383,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB84_4: ; %end @@ -51203,135 +51410,156 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, 
v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 
v24.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 
v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -54761,15 +54989,15 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -54783,82 +55011,84 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v38, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, 
v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, 
v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; 
GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -54870,338 +55100,384 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, 
v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, 
v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; 
GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 
v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 
v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 +; 
GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 
v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -64297,32 +64573,32 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -64341,26 +64617,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, 
v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow @@ -64393,26 +64669,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB96_4: ; %end @@ -64420,135 +64696,156 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 
v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 
0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 
v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -76404,32 +76701,32 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -76448,26 +76745,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow @@ -76500,26 +76797,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB104_4: ; %end @@ -76527,135 +76824,156 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 ; 
GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l 
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, 
v26.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -85374,59 +85692,59 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -85439,302 +85757,307 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v51, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.h ; 
GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v26, v20, v22, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v21, v23, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v21, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v27 +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v20, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 
v23, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v28.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v17, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v18, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v18, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v17, v23, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v29, v17, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v29.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v18, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v18, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v23, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v30 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v23 :: v_dual_add_f32 v18, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v17, 
v17, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v17, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v19, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v19, v22 :: v_dual_and_b32 v20, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_lshlrev_b32 v5, 16, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v18.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v17, v21 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v8, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v34.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v17, v8, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v6, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v5, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v5, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v21, v20 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v35.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v36 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v34 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v20, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v19, v21, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; 
GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v7, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v7, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v19, v22, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v37.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v39 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v22, 
v22, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10 ; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v22, v48 :: v_dual_add_f32 v9, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10 -; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v22, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v19, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v54.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v19, v25, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v24, v50, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v24, v50 :: v_dual_add_f32 v9, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v52 +; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11 ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v19, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v55.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v65.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v7, 0x7fff ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v70, v21, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v21, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v21, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v66, v21, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v14, v25, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v23, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v23, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v69.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v68.h ; 
GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v15 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v23, v13, 0x7fff @@ -85744,42 +86067,42 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v14, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v13, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v13, v25, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v23, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v23, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v85.h -; GFX11-TRUE16-NEXT: v_add3_u32 v13, v48, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v70.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v67 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v86.h +; GFX11-TRUE16-NEXT: v_add3_u32 v13, v37, v15, 0x7fff +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v71.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v66 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v19, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v83.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v87.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v82.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v85.h ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v23, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v86 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v81 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9 ; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13 ; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v21, v7 ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v18, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] @@ -85788,142 +86111,159 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v64, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v112.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v102.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.h, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v99.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v24 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v21.l -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, 
v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 
v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v12.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v24 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll 
index 4c485768bcbb..6fe66655de3d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -3065,12 +3065,13 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -3084,53 +3085,61 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, 
v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6205,12 +6214,13 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -6224,53 +6234,61 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, 
v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9045,12 +9063,13 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -9064,53 +9083,61 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: 
v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11576,12 +11603,13 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -11595,53 +11623,61 @@ define <2 x float> 
@bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13793,12 +13829,13 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, 
v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -13812,53 +13849,61 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15610,12 +15655,13 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -15629,53 +15675,61 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 
v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16912,12 +16966,13 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -16931,53 +16986,61 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l 
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 879e8520d8e1..e5245f7bd71d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1102,16 +1102,15 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -1126,74 +1125,80 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: 
v_and_b16 v1.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 
v5, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4236,16 +4241,15 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -4260,74 +4264,80 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h ; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v7.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6875,16 +6885,16 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -6899,74 +6909,80 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 
0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; 
GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8635,16 +8651,16 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -8659,74 +8675,80 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, 
v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10043,16 +10065,16 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -10067,74 +10089,80 @@ define <6 x i16> 
@bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; 
GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll index d6922bc09ff0..89fc6c062c29 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc %s -o %t.o -mcpu=gfx1030 -filetype=obj -O0 ; RUN: llvm-debuginfo-analyzer %t.o --print=all --attribute=all | FileCheck %s diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 1d3368b036d0..4cc39d93854a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -9022,12 +9022,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1164-TRUE16-NEXT: .LBB15_2: ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9100,12 +9101,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 ; GFX1132-TRUE16-NEXT: .LBB15_2: ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9178,12 +9180,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1264-TRUE16-NEXT: .LBB15_2: ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; 
GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -9256,12 +9259,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 ; GFX1232-TRUE16-NEXT: .LBB15_2: ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -9658,11 +9662,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1164-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9784,11 +9789,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9910,12 +9916,13 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1264-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, 
s2 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10041,12 +10048,13 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10726,15 +10734,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 -; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 ; 
GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10820,14 +10828,14 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10912,15 +10920,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 -; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 
; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -11006,14 +11014,14 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll 
b/llvm/test/CodeGen/AMDGPU/bf16.ll index 10e523d1a0cf..505ddc8c3b57 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -37774,10 +37774,9 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX11TRUE16-LABEL: v_uitofp_i16_to_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v1 +; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -40751,11 +40750,12 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; ; GFX11TRUE16-LABEL: s_select_bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v1.l, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 0ceb9019eb99..f4b432dce8c8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -3443,14 
+3443,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3568,13 +3569,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3882,14 +3884,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -4004,13 +4007,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -4324,14 +4328,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4551,14 +4556,15 @@ 
define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index cad4c39eaf39..6f1675edbe58 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -2512,16 +2512,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2640,19 +2640,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2972,16 +2973,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3097,19 +3098,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3435,16 +3437,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3670,16 +3672,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 6275afd2c699..acb27be1846b 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -2512,16 +2512,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2640,19 +2640,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2972,16 +2973,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null 
offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3097,19 +3098,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3435,16 +3437,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3670,16 +3672,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 diff 
--git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 2db7b28c7de9..ff80250bfc88 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -2745,15 +2745,6 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; ; GFX11-TRUE16-LABEL: amdgpu_cs_v32i1: ; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, v26.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, v22.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, v18.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1 ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, v10.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 1, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, v8.l, 1 @@ -2763,18 +2754,6 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 3, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 2, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v24.l, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 3, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 2, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 2, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, v14.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 1, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, v12.l, 1 @@ -2787,15 +2766,15 @@ define 
amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 2, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 3, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 2, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, v22.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v22.l, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v18.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, v26.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, v22.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 3, v15.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 2, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v12.l, v13.l @@ -2805,42 +2784,65 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v28.h, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v21.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 3, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 2, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 
v24.l, v24.l, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 3, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 2, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 3, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 2, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.l ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v8.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v19.l, 15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 4, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 3, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 2, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v28.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v23.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v1.l, 15 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 12, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v30.h, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v24.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v14.h, v18.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v16.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v20.h, 15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 4, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 15 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index ccdc0b1bf43c..b9caf8e80bcd 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -1561,10 +1561,10 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, 
v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp ; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 26f204f29f5a..b5bc09a1684e 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -946,9 +946,9 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { ; GFX11-TRUE16-LABEL: v_uitofp_i8_to_f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1770,38 +1770,40 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX11-TRUE16-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff00, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.h ; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x900, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.h ; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x900, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x900, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x900, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v6, v[0:3], s[0:1] -; GFX11-TRUE16-NEXT: global_store_b32 v6, v4, s[2:3] +; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-TRUE16-NEXT: global_store_b32 v5, v4, s[2:3] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: load_v4i8_to_v4f32_2_uses: diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index c5db7a33f70e..b0439b1f7968 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -2536,13 +2536,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 22dd66118837..8c7d5cffe39d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -8410,12 +8410,13 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8528,12 +8529,13 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -8783,12 +8785,13 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8905,12 +8908,13 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -9167,12 +9171,13 @@ define half 
@flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -9290,12 +9295,13 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -9551,11 
+9557,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -9665,11 +9671,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -9911,11 +9917,11 @@ define void 
@flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10029,11 +10035,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -10282,11 +10288,11 @@ define void 
@flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10401,11 +10407,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -10645,8 +10651,8 @@ define void 
@flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10729,8 +10735,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -10919,9 +10925,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -11007,9 +11014,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -11212,12 +11220,13 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -11336,12 +11345,13 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -11600,11 +11610,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -11720,11 +11730,11 @@ 
define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 1dc45179c74c..56ad91dd59ff 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6043,14 +6043,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6168,14 +6168,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6438,14 +6438,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6570,14 +6570,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -6847,14 +6847,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6980,14 +6980,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -7254,12 +7254,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7375,12 +7376,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7636,12 +7638,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7764,12 +7767,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 
0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -8032,12 +8036,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8161,12 +8166,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -8418,11 +8424,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8513,11 +8519,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8722,9 +8728,10 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8813,9 +8820,10 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, 
v[0:1], v[3:4] offset:2046 glc @@ -9027,14 +9035,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9161,14 +9169,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -9440,12 +9448,13 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9570,12 +9579,13 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 5d26293e7009..f0083bd23660 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6043,14 +6043,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6168,14 +6168,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6438,14 +6438,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: 
flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6570,14 +6570,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -6847,14 +6847,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 
0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6980,14 +6980,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -7254,12 +7254,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7375,12 +7376,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7636,12 +7638,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7764,12 +7767,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -8032,12 +8036,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8161,12 +8166,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -8418,11 +8424,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 
v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8513,11 +8519,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8722,9 +8728,10 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8813,9 +8820,10 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -9027,14 +9035,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9161,14 +9169,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -9440,12 +9448,13 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9570,12 +9579,13 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index d12a7f973158..3ee0bb2122ab 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -5855,12 +5855,13 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half 
%val) #0 { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5973,12 +5974,13 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6228,12 +6230,13 @@ define half 
@flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6350,12 +6353,13 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6612,12 
+6616,13 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6735,12 +6740,13 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], 
v[5:6] glc @@ -6996,11 +7002,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7110,11 +7116,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7356,11 +7362,11 @@ define void 
@flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7474,11 +7480,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7727,11 +7733,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half 
%val ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7846,11 +7852,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -8090,9 +8096,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8178,9 +8185,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8374,8 +8382,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 
0xffff, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8458,8 +8466,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8657,12 +8665,13 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8781,12 +8790,13 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; 
GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -9045,11 +9055,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9165,11 +9175,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 899cc8940544..9c4901eb19f3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -4238,7 +4238,7 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) { ; GFX11-GISEL-TRUE16-LABEL: s_mul_32_f16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e64 v0.l, 0x5000, s0 -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index a859cc91b7fd..f09c25767648 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -644,10 +644,11 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind { ; GFX11-TRUE16-LABEL: fmul_pow_mul_max_pow2: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1193,12 +1194,13 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind { ; GFX11-TRUE16-LABEL: fmul_pow_shl_cnt_safe: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0xff5f3992 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0x7befffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 40d276539554..c52fb6197e3e 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -4372,13 +4372,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; 
GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; @@ -4606,13 +4607,14 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; GFX11-GISEL-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |s2| ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 3c41cc43a089..95e28a37f5ee 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -1107,19 +1107,21 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v4i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1188,20 +1190,22 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v5i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l ; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1281,27 +1285,29 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v8i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1416,44 +1422,47 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v16i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.h, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v8.h, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 
8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v10.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v4, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v14 -; GFX11-TRUE16-NEXT: buffer_store_b128 v[5:8], off, s[0:3], 
0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v12 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[6:9], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: void_func_v16i8: @@ -1649,77 +1658,83 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v32 ; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v21.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v11.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v8.h, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32 -; 
GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v9.l, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v10.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v11, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.l, v8.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 919464a93674..2fdc1a885486 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4896,22 +4896,23 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 @@ -5155,29 +5156,30 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v6 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off ; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v2, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: 
s_mov_b32 s32, s33 -; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 @@ -5439,34 +5441,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v5 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 -; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 @@ -5906,77 +5910,85 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v19, v34 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v13.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v3.h, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v12.l -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v2, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v19.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v7, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v13 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[0:3], off -; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[5:8], off +; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[6:9], off +; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[2:5], off ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 9c1f9d21b9da..1f74fbdc46e9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -8275,12 +8275,13 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8393,12 +8394,13 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -8698,12 +8700,13 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8820,12 +8823,13 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -9134,12 +9138,13 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 
v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -9257,12 +9262,13 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -9570,11 +9576,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -9684,11 +9690,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -9979,11 +9985,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10097,11 +10103,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -10400,11 +10406,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10519,11 +10525,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -10813,9 +10819,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10901,9 +10908,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -11136,8 +11144,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -11220,8 +11228,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -11456,12 +11464,13 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -11580,12 +11589,13 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -11896,11 +11906,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -12016,11 +12026,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index f7cc0709109f..faa74fef2be2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4467,14 +4467,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4592,14 +4592,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -4912,14 +4912,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ 
-5044,14 +5044,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -5373,14 +5373,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5506,14 +5506,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -5832,12 +5832,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l 
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5953,12 +5954,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -6263,12 +6265,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6391,12 +6394,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -6709,12 +6713,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6838,12 +6843,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -7145,11 +7151,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7240,11 +7246,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7488,9 +7494,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7579,9 +7586,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7830,14 +7838,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: 
v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -7964,14 +7972,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -8295,12 +8303,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8425,12 +8434,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index b81af1fc9233..a46b0129b79e 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4467,14 +4467,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4592,14 +4592,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -4912,14 +4912,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5044,14 +5044,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -5373,14 +5373,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5506,14 +5506,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -5832,12 +5832,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5953,12 +5954,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -6263,12 +6265,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6391,12 +6394,13 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -6709,12 +6713,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV @@ -6838,12 +6843,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -7145,11 +7151,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7240,11 +7246,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7488,9 +7494,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7579,9 +7586,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7830,14 +7838,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -7964,14 +7972,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -8295,12 +8303,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8425,12 +8434,13 @@ define void 
@global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index b8762d13e132..053efdcb7626 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -5221,12 +5221,13 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 
v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5339,12 +5340,13 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -5644,12 +5646,13 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5766,12 +5769,13 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -6080,12 +6084,13 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6203,12 +6208,13 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -6516,11 +6522,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6630,11 +6636,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -6925,11 +6931,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7043,11 +7049,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -7346,11 +7352,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7465,11 +7471,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -7759,9 +7765,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7847,9 +7854,10 @@ define half 
@global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -8082,8 +8090,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8166,8 +8174,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; 
GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -8402,12 +8410,13 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8526,12 +8535,13 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: 
v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -8842,11 +8852,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8962,11 +8972,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, 
v[0:1], v[3:4], off glc diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 305461ed6b20..7ebd69204d87 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1693,11 +1693,12 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302 +; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0 ; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_endpgm @@ -2723,32 +2724,32 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: global_load_d16_u8 v0, v5, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3 -; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4 ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l +; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.l, 8, v4.l +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, 
v4.l, v0.l -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.l +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v2.l, v6.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h -; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v7, v6 +; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h +; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.h +; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v0.h, v1.l ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.h, v4.h, v0.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-DL-TRUE16-NEXT: global_store_b8 v5, v0, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d..742d87f099ce 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -1715,9 +1715,9 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: @@ -1745,7 +1745,8 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: @@ -1776,9 +1777,9 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l -; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: @@ -1814,7 +1815,8 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h -; GFX1200-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: @@ -9361,9 +9363,9 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: @@ -9407,7 +9409,8 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; 
GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: @@ -9454,9 +9457,9 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l -; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: @@ -9508,7 +9511,8 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h -; GFX1200-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index c1a32aafbc71..a42c71c4849b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -1259,12 +1259,13 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1370,12 +1371,13 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1644,12 +1646,13 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1760,12 +1763,13 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 
@@ -2040,12 +2044,13 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -2148,12 +2153,13 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -2413,11 +2419,11 @@ define void 
@local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2525,11 +2531,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2789,9 +2795,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2875,9 +2882,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -3087,8 +3095,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -3169,8 +3177,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 739e86d1928b..8351d2805756 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -803,14 +803,14 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -918,14 +918,14 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1199,14 +1199,14 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 
4.0, v3.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1319,14 +1319,14 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1606,14 +1606,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 
v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1718,14 +1718,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1990,12 +1990,13 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2106,12 +2107,13 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2377,11 +2379,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2467,11 +2469,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2686,9 +2688,10 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2772,9 +2775,10 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 6da80262951e..0c4aca88b378 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -803,14 +803,14 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -918,14 +918,14 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1199,14 +1199,14 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1319,14 +1319,14 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1606,14 +1606,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1718,14 +1718,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1990,12 +1990,13 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2106,12 +2107,13 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2377,11 +2379,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2467,11 +2469,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 
0xffff, v1 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2686,9 +2688,10 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2772,9 +2775,10 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 786989cc9fb5..37310b614c0d 100644 --- 
a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -1721,12 +1721,13 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1832,12 +1833,13 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -2106,12 +2108,13 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -2222,12 +2225,13 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, 
v4 @@ -2502,12 +2506,13 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -2610,12 +2615,13 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -2875,11 +2881,11 @@ define void 
@local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2987,11 +2993,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -3251,9 +3257,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -3337,9 +3344,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -3549,8 +3557,8 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -3631,8 +3639,8 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index eab92668c536..811e25587d3d 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -2382,22 +2382,13 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr } define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { -; SDAG-GFX1100-TRUE16-LABEL: mixlo_zext: -; SDAG-GFX1100-TRUE16: ; %bb.0: -; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; SDAG-GFX1100-FAKE16-LABEL: mixlo_zext: -; SDAG-GFX1100-FAKE16: ; %bb.0: -; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 -; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: mixlo_zext: 
+; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: mixlo_zext: ; GFX900: ; %bb.0: @@ -2427,14 +2418,6 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX1100-LABEL: mixlo_zext: -; GISEL-GFX1100: ; %bb.0: -; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: mixlo_zext: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index fbf8011fd40c..ef80323a98ec 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -179,7 +179,8 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_mad_u16_zext: @@ -221,9 +222,9 @@ define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) { ; GFX11-TRUE16-LABEL: v_mad_u16_zext64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_mad_u16_zext64: diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index 79910af5c043..3ce09475c094 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -374,7 +374,7 @@ define i32 @shl_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shl_i16_zext_i32: @@ -412,7 +412,7 @@ define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: lshr_i16_zext_i32: @@ -450,7 +450,7 @@ define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: ashr_i16_zext_i32: @@ -488,7 +488,7 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: add_u16_zext_i32: @@ -526,7 +526,7 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) { ; 
GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: sub_u16_zext_i32: @@ -564,7 +564,7 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: mul_lo_u16_zext_i32: @@ -602,7 +602,7 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: min_u16_zext_i32: @@ -641,7 +641,7 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: min_i16_zext_i32: @@ -680,7 +680,7 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: max_u16_zext_i32: @@ -719,7 +719,7 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: max_i16_zext_i32: @@ -758,7 +758,7 @@ define i32 @zext_fadd_f16(half %x, half %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_fadd_f16: @@ -797,10 +797,8 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) { ; GFX11-TRUE16-LABEL: zext_fma_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_fma_f16: @@ -840,7 +838,7 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_div_fixup_f16: @@ -882,7 +880,7 @@ define i32 @zext_fptrunc_f16(float %x) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_fptrunc_f16: @@ -926,20 +924,12 @@ define i32 
@zext_fptrunc_fma_f16(float %x, float %y, float %z) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: zext_fptrunc_fma_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: zext_fptrunc_fma_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: zext_fptrunc_fma_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fma = call float @llvm.fma.f32(float %x, float %y, float %z) %fptrunc = fptrunc float %fma to half %cast = bitcast half %fptrunc to i16 @@ -950,5 +940,3 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) { declare half @llvm.amdgcn.div.fixup.f16(half, half, half) declare half @llvm.fma.f16(half, half, half) declare float @llvm.fma.f32(float, float, float) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 91c88ec5e718..21aa40d69998 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1528,9 +1528,10 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; @@ -1559,9 +1560,10 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll index 334215125f58..30ed6ae5484c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll @@ -300,15 +300,17 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v4i8: ; GFX11-SDAG-TRUE16: ; 
%bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -346,15 +348,17 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX12-SDAG-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -514,19 +518,21 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v8i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v6.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v3.l, v7.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h -; 
GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l @@ -575,19 +581,21 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v6.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v3.l, v7.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v5.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 
v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l @@ -824,25 +832,28 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) { ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v5.l, v13.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v9.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v5.l, v7.l, v15.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v6.l, v14.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.h, v7.l, v15.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v3.l, v11.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v6.l, v14.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, v10.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v3.l, v11.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v4.l, v12.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v8.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.h, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, v12.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v3.l, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v2.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -911,25 +922,28 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v5.l, v13.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v9.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v5.l, v7.l, v15.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v6.l, v14.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.h, v7.l, v15.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v3.l, v11.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v6.l, v14.l ; GFX12-SDAG-TRUE16-NEXT: 
v_add_nc_u16 v2.l, v2.l, v10.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v3.l, v11.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v4.l, v12.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v8.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.h, v5.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, v12.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v3.l, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v2.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: 
v_lshrrev_b32_e32 v1, 8, v1 ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 1d3b42ee43b0..aab0e76410cc 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -374,12 +374,13 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) { ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v0.h, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -426,12 +427,13 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v0.h, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -622,20 +624,22 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) { ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v1.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v3.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v2.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.l, v3.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; 
GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -699,20 +703,22 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v1.l, v1.h -; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v3.l, v3.h -; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v2.l, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.l, v3.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1047,14 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1168,12 +1176,14 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v0.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; From 8d256733a05ceeda8b854cc7665724c425236673 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Mon, 18 Aug 2025 13:07:05 -0500 Subject: [PATCH 14/27] [bazel] Port #151175: VectorFromElementsLowering (#154169) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 763dbdbaee26..61c4673b6ac1 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4920,6 +4920,7 @@ cc_library( ":MemRefDialect", ":Support", ":TensorDialect", + ":UBDialect", ":VectorDialect", "//llvm:Support", ], From 064f02dac0c81c19350a74415b3245f42fed09dc Mon Sep 17 00:00:00 2001 From: Kyle Wang Date: Mon, 18 Aug 2025 11:16:32 -0700 Subject: [PATCH 15/27] [VectorCombine] Preserve scoped alias metadata (#153714) Right now if a load op is scalarized, the `!alias.scope` and `!noalias` metadata are dropped. This PR is to keep them if exist. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 16 ++++-- llvm/test/Transforms/VectorCombine/alias.ll | 56 +++++++++++++++++++ 2 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Transforms/VectorCombine/alias.ll diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 4e2a5c78e0ac..1275d53a075b 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1812,6 +1812,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { // erased in the correct order. Worklist.push(LI); + Type *ElemType = VecTy->getElementType(); + // Replace extracts with narrow scalar loads. for (User *U : LI->users()) { auto *EI = cast(U); @@ -1825,13 +1827,19 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { Builder.SetInsertPoint(EI); Value *GEP = Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx}); - auto *NewLoad = cast(Builder.CreateLoad( - VecTy->getElementType(), GEP, EI->getName() + ".scalar")); + auto *NewLoad = cast( + Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar")); - Align ScalarOpAlignment = computeAlignmentAfterScalarization( - LI->getAlign(), VecTy->getElementType(), Idx, *DL); + Align ScalarOpAlignment = + computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL); NewLoad->setAlignment(ScalarOpAlignment); + if (auto *ConstIdx = dyn_cast(Idx)) { + size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType); + AAMDNodes OldAAMD = LI->getAAMetadata(); + NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL)); + } + replaceValue(*EI, *NewLoad, false); } diff --git a/llvm/test/Transforms/VectorCombine/alias.ll b/llvm/test/Transforms/VectorCombine/alias.ll new file mode 100644 index 000000000000..459956cd997d --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/alias.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=vector-combine -S | FileCheck %s --check-prefixes=CHECK + +define <4 x i32> @quux(ptr addrspace(3) %arg) { +; CHECK-LABEL: define <4 x i32> @quux( +; CHECK-SAME: ptr addrspace(3) [[ARG:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = load i8, ptr addrspace(3) [[ARG]], align 4, !tbaa [[TBAA0:![0-9]+]], !alias.scope [[META0:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(3) [[ARG]], i32 0, i64 1 +; CHECK-NEXT: [[EXTRACTELEMENT1:%.*]] = load i8, ptr addrspace(3) [[TMP0]], align 1, !tbaa [[TBAA0]], !alias.scope [[META0]], !noalias [[META0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(3) [[ARG]], i32 0, i64 2 +; CHECK-NEXT: [[EXTRACTELEMENT2:%.*]] = load i8, ptr addrspace(3) [[TMP1]], align 2, !tbaa [[TBAA0]], !alias.scope [[META0]], !noalias [[META0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(3) [[ARG]], i32 0, i64 3 +; CHECK-NEXT: [[EXTRACTELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP2]], align 1, !tbaa [[TBAA0]], !alias.scope [[META0]], !noalias [[META0]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[EXTRACTELEMENT]] to i32 +; CHECK-NEXT: [[ZEXT4:%.*]] = zext i8 [[EXTRACTELEMENT1]] to i32 +; CHECK-NEXT: [[ZEXT5:%.*]] = zext i8 [[EXTRACTELEMENT2]] to i32 +; CHECK-NEXT: [[ZEXT6:%.*]] = zext i8 [[EXTRACTELEMENT3]] to i32 +; CHECK-NEXT: [[INSERTELEMENT:%.*]] = insertelement <4 x i32> poison, i32 [[ZEXT]], i64 0 +; CHECK-NEXT: [[INSERTELEMENT7:%.*]] = insertelement <4 x i32> [[INSERTELEMENT]], i32 [[ZEXT4]], i64 1 +; CHECK-NEXT: [[INSERTELEMENT8:%.*]] = insertelement <4 x i32> [[INSERTELEMENT7]], i32 [[ZEXT5]], i64 2 +; CHECK-NEXT: [[INSERTELEMENT9:%.*]] = insertelement <4 x i32> [[INSERTELEMENT8]], i32 [[ZEXT6]], i64 3 +; CHECK-NEXT: ret <4 x i32> [[INSERTELEMENT9]] +; +bb: + %load = load <4 x i8>, ptr addrspace(3) %arg, align 4, !alias.scope !0, 
!noalias !0, !tbaa !5 + %extractelement = extractelement <4 x i8> %load, i64 0 + %extractelement1 = extractelement <4 x i8> %load, i64 1 + %extractelement2 = extractelement <4 x i8> %load, i64 2 + %extractelement3 = extractelement <4 x i8> %load, i64 3 + %zext = zext i8 %extractelement to i32 + %zext4 = zext i8 %extractelement1 to i32 + %zext5 = zext i8 %extractelement2 to i32 + %zext6 = zext i8 %extractelement3 to i32 + %insertelement = insertelement <4 x i32> poison, i32 %zext, i64 0 + %insertelement7 = insertelement <4 x i32> %insertelement, i32 %zext4, i64 1 + %insertelement8 = insertelement <4 x i32> %insertelement7, i32 %zext5, i64 2 + %insertelement9 = insertelement <4 x i32> %insertelement8, i32 %zext6, i64 3 + ret <4 x i32> %insertelement9 +} + +!0 = !{!1} +!1 = distinct !{!1, !2} +!2 = distinct !{!2} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{!"omnipotent char", !3, i64 0} +!5 = !{!"i8", !4, i64 0} +;. +; CHECK: [[TBAA0]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 0} +; CHECK: [[META3]] = !{!"i8", [[META4:![0-9]+]]} +; CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +; CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]]} +;. 
\ No newline at end of file From ade755d62b70eae9dfc460f19f0da7ab80e9a1fd Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 18 Aug 2025 11:31:15 -0700 Subject: [PATCH 16/27] [msan] Add Instrumentation for Avx512 Instructions: pmaddw, pmaddubs (#153919) This applies the pmadd handler (recently improved in https://github.com/llvm/llvm-project/pull/153353) to the Avx512 equivalent of the pmaddw and pmaddubs intrinsics: <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) --- .../Instrumentation/MemorySanitizer.cpp | 18 +++ .../X86/avx512bw-intrinsics-upgrade.ll | 114 ++++++++++-------- .../X86/avx512bw-intrinsics.ll | 113 +++++++++-------- 3 files changed, 142 insertions(+), 103 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 6b394f533868..7865a9070740 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -5486,14 +5486,32 @@ struct MemorySanitizerVisitor : public InstVisitor { // Multiply and Add Packed Words // < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) // < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) // // Multiply and Add Packed Signed and Unsigned Bytes // < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) // <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) + // <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) + // + // These intrinsics are auto-upgraded into non-masked forms: + // < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128 + // (<8 x i16>, <8 x i16>, <4 x i32>, i8) + // < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256 + // (<16 x i16>, <16 x i16>, <8 x i32>, i8) + // <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512 + // (<32 x i16>, <32 x i16>, <16 x i32>, i16) + // < 8 x 
i16> @llvm.x86.avx512.mask.pmaddubs.w.128 + // (<16 x i8>, <16 x i8>, <8 x i16>, i8) + // <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256 + // (<32 x i8>, <32 x i8>, <16 x i16>, i16) + // <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512 + // (<64 x i8>, <64 x i8>, <32 x i16>, i32) case Intrinsic::x86_sse2_pmadd_wd: case Intrinsic::x86_avx2_pmadd_wd: + case Intrinsic::x86_avx512_pmaddw_d_512: case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: + case Intrinsic::x86_avx512_pmaddubs_w_512: handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); break; diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll index abbbb040edf1..51dad35a1edb 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll @@ -7,8 +7,6 @@ ; - llvm.x86.avx512.dbpsadbw.512 ; - llvm.x86.avx512.packssdw.512, llvm.x86.avx512.packsswb.512 ; - llvm.x86.avx512.packusdw.512, llvm.x86.avx512.packuswb.512 -; - llvm.x86.avx512.pmaddubs.w.512 -; - llvm.x86.avx512.pmaddw.d.512 ; ; Heuristically handled: ; - llvm.sadd.sat.v32i16, llvm.sadd.sat.v64i8 @@ -4931,18 +4929,21 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> % ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <64 x i8> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label 
[[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = and <64 x i1> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = and <64 x i1> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i1> [[TMP17]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <64 x i1> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = sext <64 x i1> [[TMP11]] to <64 x i8> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP12]] to <32 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <32 x i1> [[TMP14]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]]) +; CHECK-NEXT: store <32 x i16> [[TMP16]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) @@ -4956,22 +4957,25 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = and <64 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP20:%.*]] = and <64 x i1> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP21:%.*]] = and <64 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP22:%.*]] = or <64 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <64 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = sext <64 x i1> [[TMP23]] to <64 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <64 x i8> [[TMP24]] to <32 x i16> +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP25]] to <32 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x 
i16> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP18]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]] ; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[X2]] @@ -4989,18 +4993,21 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> % ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = and <32 x i1> [[TMP5]], [[TMP4]] 
+; CHECK-NEXT: [[TMP9:%.*]] = and <32 x i1> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i1> [[TMP17]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <32 x i1> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i16> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <16 x i1> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP16]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP7]] ; %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) @@ -5014,22 +5021,25 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x 
i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP21:%.*]] = and <32 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i16> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = sext <16 x i1> [[TMP25]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP18]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP18]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll index 00337da67af1..c6c7e002213b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll +++ 
b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll @@ -9,7 +9,6 @@ ; - llvm.x86.avx512.mask.pmov.wb.mem.512 ; - llvm.x86.avx512.packssdw.512, llvm.x86.avx512.packsswb.512 ; - llvm.x86.avx512.packusdw.512, llvm.x86.avx512.packuswb.512 -; - llvm.x86.avx512.pmaddubs.w.512, llvm.x86.avx512.pmaddw.d.512 ; - llvm.x86.avx512.psad.bw.512 ; ; Heuristically handled: @@ -2206,18 +2205,21 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> % ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <64 x i8> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[X1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = and <64 x i1> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = and <64 x i1> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i1> [[TMP17]], 
[[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <64 x i1> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = sext <64 x i1> [[TMP11]] to <64 x i8> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP12]] to <32 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <32 x i1> [[TMP14]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]]) +; CHECK-NEXT: store <32 x i16> [[TMP16]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP7]] ; %1 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1) @@ -2231,22 +2233,25 @@ define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <64 x i8> [[X0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> 
[[X1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = and <64 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP20:%.*]] = and <64 x i1> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP21:%.*]] = and <64 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP22:%.*]] = or <64 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <64 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = sext <64 x i1> [[TMP23]] to <64 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <64 x i8> [[TMP24]] to <32 x i16> +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i16> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP25]] to <32 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> [[X0]], <64 x i8> [[X1]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> zeroinitializer, <32 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = xor <32 x i16> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], [[TMP18]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i16> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP10]], <32 x i16> [[TMP15]], <32 x i16> [[TMP12]] ; CHECK-NEXT: [[TMP16:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP9]], <32 x i16> [[X2]] @@ -2266,18 +2271,21 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> % ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = and <32 x i1> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = and <32 x i1> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i1> [[TMP17]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <32 x i1> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP11]] to <32 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i16> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <16 x i1> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP16]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP7]] ; %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1) @@ -2291,22 +2299,25 @@ define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i ; CHECK-NEXT: 
[[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i16> [[X0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[X1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP21:%.*]] = and <32 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i16> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = sext <16 x i1> [[TMP25]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> [[X0]], <32 x i16> [[X1]]) ; CHECK-NEXT: [[TMP10:%.*]] = 
bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP18]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP18]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] From 0fb1057e40110e558e0fef8e183e485c4d01311b Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Mon, 18 Aug 2025 14:33:58 -0400 Subject: [PATCH 17/27] [SPIRV] Filter disallowed extensions for env (#150051) Not all SPIR-V extensions are allowed in every environment. When we use the `-spirv-ext=all` option, the backend currently believes that all extensions can be used. This commit filters out the extensions on the command line to remove those that are not known to be allowed for the current environment. Alternatives considered: I considered modifying the SPIRVExtensionsParser::parse to use a different list of extensions for "all" depending on the target triple. However that does not work because the target triple is not available, and cannot be made available in a reasonable way. 
Fixes #147717 --------- Co-authored-by: Victor Lomuller --- .../SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp | 26 + .../Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h | 8 + llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 23 +- llvm/lib/Target/SPIRV/SPIRVCommandLine.h | 6 + llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 8 +- .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 452 +++++++++++------- .../enable-all-extensions-avoid-invalid.ll | 16 + 7 files changed, 366 insertions(+), 173 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-avoid-invalid.ll diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp index 0ed97f5b41c5..d6b607981047 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -38,8 +38,15 @@ struct CapabilityEntry { Capability::Capability ReqCapability; }; +struct EnvironmentEntry { + OperandCategory::OperandCategory Category; + uint32_t Value; + Environment::Environment AllowedEnvironment; +}; + using namespace OperandCategory; using namespace Extension; +using namespace Environment; using namespace Capability; using namespace InstructionSet; #define GET_SymbolicOperands_DECL @@ -48,6 +55,8 @@ using namespace InstructionSet; #define GET_ExtensionEntries_IMPL #define GET_CapabilityEntries_DECL #define GET_CapabilityEntries_IMPL +#define GET_EnvironmentEntries_DECL +#define GET_EnvironmentEntries_IMPL #define GET_ExtendedBuiltins_DECL #define GET_ExtendedBuiltins_IMPL #include "SPIRVGenTables.inc" @@ -133,6 +142,23 @@ getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category, return Capabilities; } +EnvironmentList getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory Category, uint32_t Value) { + EnvironmentList Environments; + const SPIRV::EnvironmentEntry *Environment = + SPIRV::lookupEnvironmentByCategoryAndValue(Category, Value); + auto TableEnd 
= ArrayRef(SPIRV::EnvironmentEntries).end(); + while (Environment && Environment->Category == Category && + Environment->Value == Value) { + Environments.push_back(static_cast( + Environment->AllowedEnvironment)); + if (++Environment == TableEnd) + break; + } + + return Environments; +} + CapabilityList getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension) { const SPIRV::ExtensionEntry *Entry = diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index b8c467fef8e8..c2c08f883130 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -37,6 +37,11 @@ namespace Capability { #include "SPIRVGenTables.inc" } // namespace Capability +namespace Environment { +#define GET_Environment_DECL +#include "SPIRVGenTables.inc" +} // namespace Environment + namespace SourceLanguage { #define GET_SourceLanguage_DECL #include "SPIRVGenTables.inc" @@ -241,6 +246,7 @@ enum InstFlags { using CapabilityList = SmallVector; using ExtensionList = SmallVector; +using EnvironmentList = SmallVector; std::string getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category, @@ -254,6 +260,8 @@ getSymbolicOperandMaxVersion(SPIRV::OperandCategory::OperandCategory Category, CapabilityList getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category, uint32_t Value); +EnvironmentList getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory Category, uint32_t Value); CapabilityList getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension); ExtensionList diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index d9265f498973..5a5860ac1c24 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -12,7 +12,8 @@ 
//===----------------------------------------------------------------------===// #include "SPIRVCommandLine.h" -#include "llvm/ADT/StringRef.h" +#include "MCTargetDesc/SPIRVBaseInfo.h" +#include "llvm/TargetParser/Triple.h" #include #include @@ -171,3 +172,23 @@ StringRef SPIRVExtensionsParser::checkExtensions( } return StringRef(); } + +std::set +SPIRVExtensionsParser::getValidExtensions(const Triple &TT) { + std::set R; + SPIRV::Environment::Environment CurrentEnvironment = + SPIRV::Environment::Environment::EnvOpenCL; + if (TT.getOS() == Triple::Vulkan) + CurrentEnvironment = SPIRV::Environment::Environment::EnvVulkan; + + for (const auto &[ExtensionName, ExtensionEnum] : SPIRVExtensionMap) { + EnvironmentList AllowedEnv = getSymbolicOperandAllowedEnvironments( + SPIRV::OperandCategory::OperandCategory::ExtensionOperand, + ExtensionEnum); + + if (std::count(AllowedEnv.begin(), AllowedEnv.end(), CurrentEnvironment)) + R.insert(ExtensionEnum); + } + + return R; +} diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h index 3e3b22bde860..02e847b322a7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h @@ -21,6 +21,7 @@ namespace llvm { class StringRef; +class Triple; /// Command line parser for toggling SPIR-V extensions. struct SPIRVExtensionsParser @@ -42,6 +43,11 @@ public: static StringRef checkExtensions(const std::vector &ExtNames, std::set &AllowedExtensions); + + /// Returns the list of extensions that are valid for a particular + /// target environment (i.e., OpenCL or Vulkan). 
+ static std::set + getValidExtensions(const Triple &TT); }; } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index cdf3c6224d4c..690493fb426b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -166,7 +166,13 @@ void SPIRVSubtarget::initAvailableExtInstSets() { void SPIRVSubtarget::initAvailableExtensions( const std::set &AllowedExtIds) { AvailableExtensions.clear(); - AvailableExtensions.insert_range(AllowedExtIds); + const std::set &ValidExtensions = + SPIRVExtensionsParser::getValidExtensions(TargetTriple); + + for (const auto &Ext : AllowedExtIds) { + if (ValidExtensions.count(Ext)) + AvailableExtensions.insert(Ext); + } accountForAMDShaderTrinaryMinmax(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 614e83ae9b28..d2824ee2d2ca 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -109,23 +109,59 @@ def CapabilityEntries : GenericTable { let PrimaryKeyName = "lookupCapabilityByCategoryAndValue"; } +//===----------------------------------------------------------------------===// +// Lookup table for matching symbolic operands (category + 32-bit value) to +// SPIR-V environments. If an operand is allowed in more than one environment, +// there will be multiple consecutive entries present in the table. 
+//===----------------------------------------------------------------------===// + +// Forward-declare classes used in EnvironmentEntry +class Environment; + +class EnvironmentEntry value, + Environment allowedEnvironment> { + OperandCategory Category = category; + bits<32> Value = value; + Environment AllowedEnvironment = allowedEnvironment; +} + +def EnvironmentEntries : GenericTable { + let FilterClass = "EnvironmentEntry"; + let Fields = ["Category", "Value", "AllowedEnvironment"]; + string TypeOf_Category = "OperandCategory"; + string TypeOf_AllowedEnvironment = "Environment"; + let PrimaryKey = ["Category", "Value"]; + // Function for looking up a (the first) environment by category + value. Next + // environment should be consecutive. + let PrimaryKeyName = "lookupEnvironmentByCategoryAndValue"; +} + //===----------------------------------------------------------------------===// // Multiclass used to define a SymbolicOperand and at the same time declare // required extension and capabilities. 
//===----------------------------------------------------------------------===// -multiclass SymbolicOperandWithRequirements value, string mnemonic, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { - assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided for symbolic operand with value " # value; - def : SymbolicOperand; +multiclass SymbolicOperandWithRequirements< + OperandCategory category, bits<32> value, string mnemonic, + bits<32> minVersion, bits<32> maxVersion, list reqExtensions, + list reqCapabilities, list allowedEnvironments> { + assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided " + "for symbolic operand with value "#value; + def : SymbolicOperand; - assert !le(!size(reqExtensions), 1), "Too many required extensions for a symbolic/named operand: " # mnemonic; - if !eq(!size(reqExtensions), 1) then { - def : ExtensionEntry; - } + assert !le(!size(reqExtensions), 1), + "Too many required extensions for a symbolic/named operand: "#mnemonic; + if !eq(!size(reqExtensions), 1) then { + def : ExtensionEntry; + } - foreach capability = reqCapabilities in { - def : CapabilityEntry; - } + foreach capability = reqCapabilities in { + def : CapabilityEntry; + } + + foreach environment = allowedEnvironments in { + def : EnvironmentEntry; + } } //===----------------------------------------------------------------------===// @@ -175,6 +211,20 @@ def CooperativeMatrixOperandsOperand : OperandCategory; def SpecConstantOpOperandsOperand : OperandCategory; def MatrixMultiplyAccumulateOperandsOperand : OperandCategory; +//===----------------------------------------------------------------------===// +// Definition of the Environments +//===----------------------------------------------------------------------===// + +def Environment : GenericEnum, Operand { + let FilterClass = "Environment"; + let ValueField = "Value"; +} + +class Environment value> { bits<32> Value = value; } + +def 
EnvOpenCL : Environment<0>; +def EnvVulkan : Environment<1>; + //===----------------------------------------------------------------------===// // Multiclass used to define Extesions enum values and at the same time // SymbolicOperand entries. @@ -192,135 +242,146 @@ class Extension value> { bits<32> Value = value; } -multiclass ExtensionOperand value> { +multiclass ExtensionOperand value, + list allowedEnvironments> { def NAME : Extension; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } -defm SPV_AMD_shader_explicit_vertex_parameter : ExtensionOperand<1>; -defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2>; -defm SPV_AMD_gcn_shader : ExtensionOperand<3>; -defm SPV_KHR_shader_ballot : ExtensionOperand<4>; -defm SPV_AMD_shader_ballot : ExtensionOperand<5>; -defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6>; -defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7>; -defm SPV_KHR_subgroup_vote : ExtensionOperand<8>; -defm SPV_KHR_16bit_storage : ExtensionOperand<9>; -defm SPV_KHR_device_group : ExtensionOperand<10>; -defm SPV_KHR_multiview : ExtensionOperand<11>; -defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12>; -defm SPV_NV_viewport_array2 : ExtensionOperand<13>; -defm SPV_NV_stereo_view_rendering : ExtensionOperand<14>; -defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15>; -defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16>; -defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17>; -defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18>; -defm SPV_KHR_variable_pointers : ExtensionOperand<19>; -defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20>; -defm SPV_KHR_post_depth_coverage : ExtensionOperand<21>; -defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22>; -defm SPV_EXT_shader_stencil_export : ExtensionOperand<23>; -defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24>; -defm SPV_AMD_shader_image_load_store_lod : 
ExtensionOperand<25>; -defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26>; -defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27>; -defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28>; -defm SPV_GOOGLE_decorate_string : ExtensionOperand<29>; -defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30>; -defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31>; -defm SPV_EXT_descriptor_indexing : ExtensionOperand<32>; -defm SPV_KHR_8bit_storage : ExtensionOperand<33>; -defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34>; -defm SPV_NV_ray_tracing : ExtensionOperand<35>; -defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36>; -defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37>; -defm SPV_NV_mesh_shader : ExtensionOperand<38>; -defm SPV_NV_shader_image_footprint : ExtensionOperand<39>; -defm SPV_NV_shading_rate : ExtensionOperand<40>; -defm SPV_INTEL_subgroups : ExtensionOperand<41>; -defm SPV_INTEL_media_block_io : ExtensionOperand<42>; -defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44>; -defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45>; -defm SPV_KHR_float_controls : ExtensionOperand<46>; -defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47>; -defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48>; -defm SPV_NV_cooperative_matrix : ExtensionOperand<49>; -defm SPV_INTEL_shader_integer_functions2 : ExtensionOperand<50>; -defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51>; -defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52>; -defm SPV_NV_shader_sm_builtins : ExtensionOperand<53>; -defm SPV_KHR_shader_clock : ExtensionOperand<54>; -defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55>; -defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56>; -defm SPV_INTEL_fpga_reg : ExtensionOperand<57>; -defm SPV_INTEL_blocking_pipes : ExtensionOperand<58>; -defm SPV_GOOGLE_user_type : ExtensionOperand<59>; -defm SPV_KHR_physical_storage_buffer : 
ExtensionOperand<60>; -defm SPV_INTEL_kernel_attributes : ExtensionOperand<61>; -defm SPV_KHR_non_semantic_info : ExtensionOperand<62>; -defm SPV_INTEL_io_pipes : ExtensionOperand<63>; -defm SPV_KHR_ray_tracing : ExtensionOperand<64>; -defm SPV_KHR_ray_query : ExtensionOperand<65>; -defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66>; -defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67>; -defm SPV_EXT_shader_atomic_float_add : ExtensionOperand<68>; -defm SPV_KHR_terminate_invocation : ExtensionOperand<69>; -defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70>; -defm SPV_EXT_shader_image_int64 : ExtensionOperand<71>; -defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72>; -defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73>; -defm SPV_INTEL_loop_fuse : ExtensionOperand<74>; -defm SPV_EXT_shader_atomic_float_min_max : ExtensionOperand<75>; -defm SPV_KHR_workgroup_memory_explicit_layout : ExtensionOperand<76>; -defm SPV_KHR_linkonce_odr : ExtensionOperand<77>; -defm SPV_KHR_expect_assume : ExtensionOperand<78>; -defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79>; -defm SPV_NV_bindless_texture : ExtensionOperand<80>; -defm SPV_INTEL_fpga_invocation_pipelining_attributes : ExtensionOperand<81>; -defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82>; -defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83>; -defm SPV_KHR_integer_dot_product : ExtensionOperand<84>; -defm SPV_EXT_shader_atomic_float16_add : ExtensionOperand<85>; -defm SPV_INTEL_runtime_aligned : ExtensionOperand<86>; -defm SPV_KHR_bit_instructions : ExtensionOperand<87>; -defm SPV_NV_ray_tracing_motion_blur : ExtensionOperand<88>; -defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89>; -defm SPV_KHR_subgroup_rotate : ExtensionOperand<90>; -defm SPV_INTEL_split_barrier : ExtensionOperand<91>; -defm SPV_KHR_ray_cull_mask : ExtensionOperand<92>; -defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93>; -defm 
SPV_EXT_relaxed_printf_string_address_space : ExtensionOperand<94>; -defm SPV_EXT_ycbcr_attachments : ExtensionOperand<95>; -defm SPV_EXT_mesh_shader : ExtensionOperand<96>; -defm SPV_ARM_core_builtins : ExtensionOperand<97>; -defm SPV_EXT_opacity_micromap : ExtensionOperand<98>; -defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99>; -defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100>; -defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101>; -defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102>; -defm SPV_INTEL_optnone : ExtensionOperand<103>; -defm SPV_INTEL_function_pointers : ExtensionOperand<104>; -defm SPV_INTEL_variable_length_array : ExtensionOperand<105>; -defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>; -defm SPV_INTEL_inline_assembly : ExtensionOperand<107>; -defm SPV_INTEL_cache_controls : ExtensionOperand<108>; -defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>; -defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>; -defm SPV_KHR_cooperative_matrix : ExtensionOperand<111>; -defm SPV_EXT_arithmetic_fence : ExtensionOperand<112>; -defm SPV_EXT_optnone : ExtensionOperand<113>; -defm SPV_INTEL_joint_matrix : ExtensionOperand<114>; -defm SPV_INTEL_float_controls2 : ExtensionOperand<115>; -defm SPV_INTEL_bindless_images : ExtensionOperand<116>; -defm SPV_INTEL_long_composites : ExtensionOperand<117>; -defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118>; -defm SPV_INTEL_fp_max_error : ExtensionOperand<119>; -defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120>; -defm SPV_INTEL_subgroup_matrix_multiply_accumulate : ExtensionOperand<121>; -defm SPV_INTEL_2d_block_io : ExtensionOperand<122>; -defm SPV_INTEL_int4 : ExtensionOperand<123>; -defm SPV_KHR_float_controls2 : ExtensionOperand<124>; -defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125>; +defm SPV_AMD_shader_explicit_vertex_parameter + : ExtensionOperand<1, [EnvVulkan]>; +defm 
SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2, [EnvVulkan]>; +defm SPV_AMD_gcn_shader : ExtensionOperand<3, [EnvVulkan]>; +defm SPV_KHR_shader_ballot : ExtensionOperand<4, [EnvVulkan]>; +defm SPV_AMD_shader_ballot : ExtensionOperand<5, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6, [EnvVulkan]>; +defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7, [EnvVulkan]>; +defm SPV_KHR_subgroup_vote : ExtensionOperand<8, [EnvVulkan]>; +defm SPV_KHR_16bit_storage : ExtensionOperand<9, [EnvVulkan]>; +defm SPV_KHR_device_group : ExtensionOperand<10, [EnvVulkan]>; +defm SPV_KHR_multiview : ExtensionOperand<11, [EnvVulkan]>; +defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12, [EnvVulkan]>; +defm SPV_NV_viewport_array2 : ExtensionOperand<13, [EnvVulkan]>; +defm SPV_NV_stereo_view_rendering : ExtensionOperand<14, [EnvVulkan]>; +defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15, [EnvVulkan]>; +defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16, [EnvVulkan]>; +defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17, [EnvVulkan]>; +defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18, [EnvVulkan]>; +defm SPV_KHR_variable_pointers : ExtensionOperand<19, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20, [EnvVulkan]>; +defm SPV_KHR_post_depth_coverage : ExtensionOperand<21, [EnvVulkan]>; +defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22, []>; +defm SPV_EXT_shader_stencil_export : ExtensionOperand<23, [EnvVulkan]>; +defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24, [EnvVulkan]>; +defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25, [EnvVulkan]>; +defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26, [EnvVulkan]>; +defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27, [EnvVulkan]>; +defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28, [EnvVulkan]>; +defm SPV_GOOGLE_decorate_string : ExtensionOperand<29, [EnvVulkan]>; 
+defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30, [EnvVulkan]>; +defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31, [EnvVulkan]>; +defm SPV_EXT_descriptor_indexing : ExtensionOperand<32, [EnvVulkan]>; +defm SPV_KHR_8bit_storage : ExtensionOperand<33, [EnvVulkan]>; +defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34, [EnvVulkan]>; +defm SPV_NV_ray_tracing : ExtensionOperand<35, [EnvVulkan]>; +defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36, [EnvVulkan]>; +defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37, [EnvVulkan]>; +defm SPV_NV_mesh_shader : ExtensionOperand<38, [EnvVulkan]>; +defm SPV_NV_shader_image_footprint : ExtensionOperand<39, [EnvVulkan]>; +defm SPV_NV_shading_rate : ExtensionOperand<40, [EnvVulkan]>; +defm SPV_INTEL_subgroups : ExtensionOperand<41, [EnvOpenCL]>; +defm SPV_INTEL_media_block_io : ExtensionOperand<42, [EnvOpenCL]>; +defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44, [EnvVulkan]>; +defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45, [EnvOpenCL]>; +defm SPV_KHR_float_controls : ExtensionOperand<46, [EnvVulkan, EnvOpenCL]>; +defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47, [EnvVulkan]>; +defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48, [EnvOpenCL]>; +defm SPV_NV_cooperative_matrix : ExtensionOperand<49, [EnvVulkan]>; +defm SPV_INTEL_shader_integer_functions2 + : ExtensionOperand<50, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51, [EnvOpenCL]>; +defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52, [EnvVulkan]>; +defm SPV_NV_shader_sm_builtins : ExtensionOperand<53, [EnvVulkan]>; +defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>; +defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>; +defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>; +defm SPV_INTEL_blocking_pipes : 
ExtensionOperand<58, [EnvOpenCL]>; +defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>; +defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>; +defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>; +defm SPV_KHR_non_semantic_info : ExtensionOperand<62, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_io_pipes : ExtensionOperand<63, [EnvOpenCL]>; +defm SPV_KHR_ray_tracing : ExtensionOperand<64, [EnvVulkan]>; +defm SPV_KHR_ray_query : ExtensionOperand<65, [EnvVulkan]>; +defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66, [EnvOpenCL]>; +defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67, [EnvOpenCL]>; +defm SPV_EXT_shader_atomic_float_add + : ExtensionOperand<68, [EnvVulkan, EnvOpenCL]>; +defm SPV_KHR_terminate_invocation : ExtensionOperand<69, [EnvVulkan]>; +defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70, [EnvVulkan]>; +defm SPV_EXT_shader_image_int64 : ExtensionOperand<71, [EnvVulkan]>; +defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72, [EnvOpenCL]>; +defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73, [EnvOpenCL]>; +defm SPV_INTEL_loop_fuse : ExtensionOperand<74, [EnvOpenCL]>; +defm SPV_EXT_shader_atomic_float_min_max + : ExtensionOperand<75, [EnvVulkan, EnvOpenCL]>; +defm SPV_KHR_workgroup_memory_explicit_layout + : ExtensionOperand<76, [EnvVulkan]>; +defm SPV_KHR_linkonce_odr : ExtensionOperand<77, [EnvOpenCL]>; +defm SPV_KHR_expect_assume : ExtensionOperand<78, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79, [EnvOpenCL]>; +defm SPV_NV_bindless_texture : ExtensionOperand<80, [EnvVulkan]>; +defm SPV_INTEL_fpga_invocation_pipelining_attributes + : ExtensionOperand<81, [EnvOpenCL]>; +defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82, [EnvVulkan]>; +defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83, [EnvVulkan]>; +defm SPV_KHR_integer_dot_product : ExtensionOperand<84, [EnvVulkan, EnvOpenCL]>; +defm 
SPV_EXT_shader_atomic_float16_add + : ExtensionOperand<85, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_runtime_aligned : ExtensionOperand<86, [EnvOpenCL]>; +defm SPV_KHR_bit_instructions : ExtensionOperand<87, [EnvOpenCL]>; +defm SPV_NV_ray_tracing_motion_blur : ExtensionOperand<88, [EnvVulkan]>; +defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89, [EnvOpenCL]>; +defm SPV_KHR_subgroup_rotate : ExtensionOperand<90, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_split_barrier : ExtensionOperand<91, [EnvOpenCL]>; +defm SPV_KHR_ray_cull_mask : ExtensionOperand<92, [EnvVulkan]>; +defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93, [EnvVulkan]>; +defm SPV_EXT_relaxed_printf_string_address_space + : ExtensionOperand<94, [EnvOpenCL]>; +defm SPV_EXT_mesh_shader : ExtensionOperand<96, [EnvVulkan]>; +defm SPV_ARM_core_builtins : ExtensionOperand<97, [EnvVulkan]>; +defm SPV_EXT_opacity_micromap : ExtensionOperand<98, [EnvVulkan]>; +defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99, [EnvVulkan]>; +defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100, [EnvOpenCL]>; +defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101, [EnvOpenCL]>; +defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102, [EnvOpenCL]>; +defm SPV_INTEL_optnone : ExtensionOperand<103, [EnvOpenCL]>; +defm SPV_INTEL_function_pointers : ExtensionOperand<104, [EnvOpenCL]>; +defm SPV_INTEL_variable_length_array : ExtensionOperand<105, [EnvOpenCL]>; +defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106, [EnvOpenCL]>; +defm SPV_INTEL_inline_assembly : ExtensionOperand<107, [EnvOpenCL]>; +defm SPV_INTEL_cache_controls : ExtensionOperand<108, [EnvOpenCL]>; +defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109, [EnvOpenCL]>; +defm SPV_INTEL_global_variable_fpga_decorations + : ExtensionOperand<110, [EnvOpenCL]>; +defm SPV_KHR_cooperative_matrix : ExtensionOperand<111, [EnvVulkan, EnvOpenCL]>; +defm SPV_EXT_arithmetic_fence : ExtensionOperand<112, [EnvOpenCL]>; 
+defm SPV_EXT_optnone : ExtensionOperand<113, [EnvOpenCL]>; +defm SPV_INTEL_joint_matrix : ExtensionOperand<114, [EnvOpenCL]>; +defm SPV_INTEL_float_controls2 : ExtensionOperand<115, [EnvOpenCL]>; +defm SPV_INTEL_bindless_images : ExtensionOperand<116, [EnvOpenCL]>; +defm SPV_INTEL_long_composites : ExtensionOperand<117, [EnvOpenCL]>; +defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118, [EnvOpenCL]>; +defm SPV_INTEL_fp_max_error : ExtensionOperand<119, [EnvOpenCL]>; +defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120, [EnvOpenCL]>; +defm SPV_INTEL_subgroup_matrix_multiply_accumulate + : ExtensionOperand<121, [EnvOpenCL]>; +defm SPV_INTEL_2d_block_io : ExtensionOperand<122, [EnvOpenCL]>; +defm SPV_INTEL_int4 : ExtensionOperand<123, [EnvOpenCL]>; +defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -342,7 +403,9 @@ class Capability value> { multiclass CapabilityOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def NAME : Capability; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Matrix : CapabilityOperand<0, 0, 0, [], []>; @@ -551,7 +614,8 @@ class SourceLanguage value> { multiclass SourceLanguageOperand value> { def : SourceLanguage; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Unknown : SourceLanguageOperand<0>; @@ -580,7 +644,8 @@ class AddressingModel value> { multiclass AddressingModelOperand value, list reqCapabilities> { def : AddressingModel; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Logical : AddressingModelOperand<0, []>; @@ -607,7 +672,8 @@ class ExecutionModel value> { multiclass 
ExecutionModelOperand value, list reqCapabilities> { def : ExecutionModel; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Vertex : ExecutionModelOperand<0, [Shader]>; @@ -645,7 +711,8 @@ class MemoryModel value> { multiclass MemoryModelOperand value, list reqCapabilities> { def : MemoryModel; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Simple : MemoryModelOperand<0, [Shader]>; @@ -672,7 +739,8 @@ class ExecutionMode value> { multiclass ExecutionModeOperand value, list reqCapabilities> { def : ExecutionMode; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Invocations : ExecutionModeOperand<0, [Geometry]>; @@ -748,7 +816,8 @@ class StorageClass value> { multiclass StorageClassOperand value, list reqExtensions, list reqCapabilities> { def : StorageClass; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm UniformConstant : StorageClassOperand<0, [], []>; @@ -794,7 +863,8 @@ class Dim value> { multiclass DimOperand value, string mnemonic, list reqCapabilities> { def NAME : Dim; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm DIM_1D : DimOperand<0, "1D", [Sampled1D, Image1D]>; @@ -824,7 +894,8 @@ class SamplerAddressingMode value> { multiclass SamplerAddressingModeOperand value, list reqCapabilities> { def : SamplerAddressingMode; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : SamplerAddressingModeOperand<0, [Kernel]>; @@ -852,7 +923,8 @@ class SamplerFilterMode value> { multiclass SamplerFilterModeOperand value, list reqCapabilities> { def : SamplerFilterMode; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Nearest : SamplerFilterModeOperand<0, [Kernel]>; @@ -877,7 +949,8 @@ class ImageFormat value> { multiclass ImageFormatOperand value, list reqCapabilities> { def 
NAME : ImageFormat; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Unknown : ImageFormatOperand<0, []>; @@ -940,7 +1013,8 @@ class ImageChannelOrder value> { multiclass ImageChannelOrderOperand value, list reqCapabilities> { def : ImageChannelOrder; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm R : ImageChannelOrderOperand<0, [Kernel]>; @@ -983,7 +1057,8 @@ class ImageChannelDataType value> { multiclass ImageChannelDataTypeOperand value, list reqCapabilities> { def : ImageChannelDataType; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm SnormInt8 : ImageChannelDataTypeOperand<0, []>; @@ -1023,7 +1098,8 @@ class ImageOperand value> { multiclass ImageOperandOperand value, list reqCapabilities> { def : ImageOperand; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : ImageOperandOperand<0x0, []>; @@ -1061,7 +1137,8 @@ class FPFastMathMode value> { multiclass FPFastMathModeOperand value, list reqCapabilities> { def : FPFastMathMode; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : FPFastMathModeOperand<0x0, []>; @@ -1090,7 +1167,8 @@ class FPRoundingMode value> { multiclass FPRoundingModeOperand value> { def NAME : FPRoundingMode; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm RTE : FPRoundingModeOperand<0>; @@ -1117,7 +1195,8 @@ class LinkageType value> { multiclass LinkageTypeOperand value, list reqCapabilities> { def : LinkageType; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Export : LinkageTypeOperand<0, [Linkage]>; @@ -1143,7 +1222,8 @@ class AccessQualifier value> { multiclass AccessQualifierOperand value, list reqCapabilities> { def NAME : AccessQualifier; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm 
ReadOnly : AccessQualifierOperand<0, [Kernel]>; @@ -1170,7 +1250,9 @@ class FunctionParameterAttribute value> { multiclass FunctionParameterAttributeOperand value, list reqCapabilities> { def : FunctionParameterAttribute; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Zext : FunctionParameterAttributeOperand<0, [Kernel]>; @@ -1202,7 +1284,9 @@ class Decoration value> { multiclass DecorationOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def : Decoration; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm RelaxedPrecision : DecorationOperand<0, 0, 0, [], [Shader]>; @@ -1303,7 +1387,9 @@ class BuiltIn value> { multiclass BuiltInOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def NAME : BuiltIn; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Position : BuiltInOperand<0, 0, 0, [], [Shader]>; @@ -1417,7 +1503,8 @@ class SelectionControl value> { multiclass SelectionControlOperand value> { def : SelectionControl; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : SelectionControlOperand<0x0>; @@ -1443,7 +1530,8 @@ class LoopControl value> { multiclass LoopControlOperand value> { def : LoopControl; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : LoopControlOperand<0x0>; @@ -1476,7 +1564,8 @@ class FunctionControl value> { multiclass FunctionControlOperand value> { def : FunctionControl; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : FunctionControlOperand<0x0>; @@ -1506,7 +1595,9 @@ class MemorySemantics value> { multiclass MemorySemanticsOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def : MemorySemantics; - defm : 
SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : MemorySemanticsOperand<0x0, 0, 0, [], []>; @@ -1544,7 +1635,9 @@ class MemoryOperand value> { multiclass MemoryOperandOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def : MemoryOperand; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : MemoryOperandOperand<0x0, 0, 0, [], []>; @@ -1577,7 +1670,9 @@ class Scope value> { multiclass ScopeOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def : Scope; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm CrossDevice : ScopeOperand<0, 0, 0, [], []>; @@ -1607,7 +1702,9 @@ class GroupOperation value> { multiclass GroupOperationOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def NAME : GroupOperation; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm Reduce : GroupOperationOperand<0, 0, 0, [], [Kernel, GroupNonUniformArithmetic, GroupNonUniformBallot]>; @@ -1638,7 +1735,9 @@ class KernelEnqueueFlags value> { multiclass KernelEnqueueFlagsOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def : KernelEnqueueFlags; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm NoWait : KernelEnqueueFlagsOperand<0, 0, 0, [], [Kernel]>; @@ -1665,7 +1764,9 @@ class KernelProfilingInfo value> { multiclass KernelProfilingInfoOperand value, bits<32> minVersion, bits<32> maxVersion, list reqExtensions, list reqCapabilities> { def : KernelProfilingInfo; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm None : KernelProfilingInfoOperand<0x0, 0, 0, [], []>; @@ -1690,7 +1791,8 @@ class Opcode value> { multiclass OpcodeOperand value> { def : Opcode; - defm 
: SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } // TODO: implement other mnemonics. defm InBoundsAccessChain : OpcodeOperand<66>; @@ -1720,7 +1822,9 @@ class CooperativeMatrixLayout value> { multiclass CooperativeMatrixLayoutOperand value, list reqExtensions, list reqCapabilities> { def : CooperativeMatrixLayout; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm RowMajorKHR : CooperativeMatrixLayoutOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>; @@ -1747,7 +1851,9 @@ class CooperativeMatrixOperands value> { multiclass CooperativeMatrixOperandsOperand value, list reqExtensions, list reqCapabilities> { def : CooperativeMatrixOperands; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } defm NoneKHR : CooperativeMatrixOperandsOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>; @@ -1780,7 +1886,9 @@ class SpecConstantOpOperands value> { multiclass SpecConstantOpOperandsOperand value, list reqExtensions, list reqCapabilities> { def : SpecConstantOpOperands; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements; } // Conversion @@ -1868,7 +1976,9 @@ class MatrixMultiplyAccumulateOperands value> { multiclass MatrixMultiplyAccumulateOperandsOperand value, list reqExtensions> { def : MatrixMultiplyAccumulateOperands; - defm : SymbolicOperandWithRequirements; + defm : SymbolicOperandWithRequirements< + MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0, + reqExtensions, [], []>; } defm None : MatrixMultiplyAccumulateOperandsOperand<0x0, [SPV_INTEL_subgroup_matrix_multiply_accumulate]>; diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-avoid-invalid.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-avoid-invalid.ll new file mode 100644 index 000000000000..2de7fff0bc90 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-avoid-invalid.ll @@ -0,0 
+1,16 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.6-vulkan1.3-compute --spirv-ext=all %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-compute --spirv-ext=all %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %} + +; CHECK-NOT: OpExtension "SPV_KHR_no_integer_wrap_decoration" + +define internal void @foo(i32 %i) local_unnamed_addr { + %sub.i = sub nsw i32 0, %i + ret void +} + +define internal void @main() local_unnamed_addr #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } \ No newline at end of file From 8429f7faaa5c5afdece49be04bc5720d5110b6d1 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 18 Aug 2025 13:35:02 -0500 Subject: [PATCH 18/27] [flang][OpenMP] Parsing support for DYN_GROUPPRIVATE (#153615) This does not perform semantic checks or lowering. --- flang/include/flang/Lower/OpenMP/Clauses.h | 1 + flang/include/flang/Parser/dump-parse-tree.h | 4 ++ flang/include/flang/Parser/parse-tree.h | 14 +++- flang/lib/Lower/OpenMP/Clauses.cpp | 23 ++++++ flang/lib/Parser/openmp-parsers.cpp | 19 ++++- flang/lib/Parser/unparse.cpp | 6 ++ flang/lib/Semantics/check-omp-structure.cpp | 1 + .../OpenMP/Todo/dyn-groupprivate-clause.f90 | 10 +++ .../Parser/OpenMP/dyn-groupprivate-clause.f90 | 70 +++++++++++++++++++ llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 22 ++++-- llvm/include/llvm/Frontend/OpenMP/OMP.td | 34 +++++++++ 11 files changed, 196 insertions(+), 8 deletions(-) create mode 100644 flang/test/Lower/OpenMP/Todo/dyn-groupprivate-clause.f90 create mode 100644 flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h index 7f317f05f67b..1ab594ffcd20 100644 --- a/flang/include/flang/Lower/OpenMP/Clauses.h +++ b/flang/include/flang/Lower/OpenMP/Clauses.h @@ -219,6 +219,7 @@ using DistSchedule = tomp::clause::DistScheduleT; using Doacross = 
tomp::clause::DoacrossT; using DynamicAllocators = tomp::clause::DynamicAllocatorsT; +using DynGroupprivate = tomp::clause::DynGroupprivateT; using Enter = tomp::clause::EnterT; using Exclusive = tomp::clause::ExclusiveT; using Fail = tomp::clause::FailT; diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 2c666a6d09a7..a4380e19cdba 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -525,6 +525,8 @@ public: NODE(parser, OmpAbsentClause) NODE(parser, OmpAffinityClause) NODE(OmpAffinityClause, Modifier) + NODE(parser, OmpAccessGroup) + NODE_ENUM(OmpAccessGroup, Value) NODE(parser, OmpAlignment) NODE(parser, OmpAlignClause) NODE(parser, OmpAlignedClause) @@ -569,6 +571,8 @@ public: NODE_ENUM(OmpDependenceType, Value) NODE(parser, OmpTaskDependenceType) NODE_ENUM(OmpTaskDependenceType, Value) + NODE(parser, OmpDynGroupprivateClause) + NODE(OmpDynGroupprivateClause, Modifier) NODE(parser, OmpIndirectClause) NODE(parser, OmpIterationOffset) NODE(parser, OmpIteration) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index e72190f019dd..e9045b4f772e 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3736,6 +3736,11 @@ inline namespace modifier { // ENUM_CLASS(Value, Keyword1, Keyword2); // }; +struct OmpAccessGroup { + ENUM_CLASS(Value, Cgroup); + WRAPPER_CLASS_BOILERPLATE(OmpAccessGroup, Value); +}; + // Ref: [4.5:72-81], [5.0:110-119], [5.1:134-143], [5.2:169-170] // // alignment -> @@ -4019,8 +4024,9 @@ struct OmpOrderModifier { // // prescriptiveness -> // STRICT // since 5.1 +// FALLBACK // since 6.1 struct OmpPrescriptiveness { - ENUM_CLASS(Value, Strict) + ENUM_CLASS(Value, Strict, Fallback) WRAPPER_CLASS_BOILERPLATE(OmpPrescriptiveness, Value); }; @@ -4375,6 +4381,12 @@ struct OmpDeviceTypeClause { WRAPPER_CLASS_BOILERPLATE(OmpDeviceTypeClause, 
DeviceTypeDescription); }; +struct OmpDynGroupprivateClause { + TUPLE_CLASS_BOILERPLATE(OmpDynGroupprivateClause); + MODIFIER_BOILERPLATE(OmpAccessGroup, OmpPrescriptiveness); + std::tuple t; +}; + // Ref: [5.2:158-159], [6.0:289-290] // // enter-clause -> diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 7f75aae09def..1a16e1c87e25 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -396,6 +396,8 @@ makePrescriptiveness(parser::OmpPrescriptiveness::Value v) { switch (v) { case parser::OmpPrescriptiveness::Value::Strict: return clause::Prescriptiveness::Strict; + case parser::OmpPrescriptiveness::Value::Fallback: + return clause::Prescriptiveness::Fallback; } llvm_unreachable("Unexpected prescriptiveness"); } @@ -770,6 +772,27 @@ Doacross make(const parser::OmpClause::Doacross &inp, // DynamicAllocators: empty +DynGroupprivate make(const parser::OmpClause::DynGroupprivate &inp, + semantics::SemanticsContext &semaCtx) { + // inp.v -> parser::OmpDynGroupprivateClause + CLAUSET_ENUM_CONVERT( // + convert, parser::OmpAccessGroup::Value, DynGroupprivate::AccessGroup, + // clang-format off + MS(Cgroup, Cgroup) + // clang-format on + ); + + auto &mods = semantics::OmpGetModifiers(inp.v); + auto *m0 = semantics::OmpGetUniqueModifier(mods); + auto *m1 = semantics::OmpGetUniqueModifier(mods); + auto &size = std::get(inp.v.t); + + return DynGroupprivate{ + {/*AccessGroup=*/maybeApplyToV(convert, m0), + /*Prescriptiveness=*/maybeApplyToV(makePrescriptiveness, m1), + /*Size=*/makeExpr(size, semaCtx)}}; +} + Enter make(const parser::OmpClause::Enter &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpEnterClause diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 46b14861096f..d83635952740 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -469,6 +469,9 @@ TYPE_PARSER(sourced(construct( // --- Parsers for
clause modifiers ----------------------------------- +TYPE_PARSER(construct( // + "CGROUP" >> pure(OmpAccessGroup::Value::Cgroup))) + TYPE_PARSER(construct(scalarIntExpr)) TYPE_PARSER(construct( // @@ -573,7 +576,8 @@ TYPE_PARSER(construct( "SIMD" >> pure(OmpOrderingModifier::Value::Simd))) TYPE_PARSER(construct( - "STRICT" >> pure(OmpPrescriptiveness::Value::Strict))) + "STRICT" >> pure(OmpPrescriptiveness::Value::Strict) || + "FALLBACK" >> pure(OmpPrescriptiveness::Value::Fallback))) TYPE_PARSER(construct( // "PRESENT" >> pure(OmpPresentModifier::Value::Present))) @@ -636,6 +640,12 @@ TYPE_PARSER(sourced(construct(sourced( construct( Parser{}))))) +TYPE_PARSER( // + sourced(construct( + Parser{})) || + sourced(construct( + Parser{}))) + TYPE_PARSER( sourced(construct(Parser{}))) @@ -777,6 +787,10 @@ TYPE_PARSER(construct( Parser{}) || construct(indirect(Parser{})))) +TYPE_PARSER(construct( + maybe(nonemptyList(Parser{}) / ":"), + scalarIntExpr)) + TYPE_PARSER(construct( maybe(nonemptyList(Parser{}) / ":"), Parser{})) @@ -1068,6 +1082,9 @@ TYPE_PARSER( // construct(parenthesized(Parser{})) || "DYNAMIC_ALLOCATORS" >> construct(construct()) || + "DYN_GROUPPRIVATE" >> + construct(construct( + parenthesized(Parser{}))) || "ENTER" >> construct(construct( parenthesized(Parser{}))) || "EXCLUSIVE" >> construct(construct( diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 4f8d49897280..f3b82975a837 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2250,6 +2250,11 @@ public: Walk(std::get(x.t)); Walk(": ", std::get>>(x.t)); } + void Unparse(const OmpDynGroupprivateClause &x) { + using Modifier = OmpDynGroupprivateClause::Modifier; + Walk(std::get>>(x.t), ": "); + Walk(std::get(x.t)); + } void Unparse(const OmpEnterClause &x) { using Modifier = OmpEnterClause::Modifier; Walk(std::get>>(x.t), ": "); @@ -2941,6 +2946,7 @@ public: WALK_NESTED_ENUM(OmpTaskDependenceType, Value) // OMP task-dependence-type 
WALK_NESTED_ENUM(OmpScheduleClause, Kind) // OMP schedule-kind WALK_NESTED_ENUM(OmpSeverityClause, Severity) // OMP severity + WALK_NESTED_ENUM(OmpAccessGroup, Value) WALK_NESTED_ENUM(OmpDeviceModifier, Value) // OMP device modifier WALK_NESTED_ENUM( OmpDeviceTypeClause, DeviceTypeDescription) // OMP device_type diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index bf126bbb0d8c..d9092565449d 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2581,6 +2581,7 @@ CHECK_SIMPLE_CLAUSE(Default, OMPC_default) CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) +CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) CHECK_SIMPLE_CLAUSE(Final, OMPC_final) CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) diff --git a/flang/test/Lower/OpenMP/Todo/dyn-groupprivate-clause.f90 b/flang/test/Lower/OpenMP/Todo/dyn-groupprivate-clause.f90 new file mode 100644 index 000000000000..e06470f772bf --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/dyn-groupprivate-clause.f90 @@ -0,0 +1,10 @@ +!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=61 -o - %s 2>&1 | FileCheck %s + +!CHECK: not yet implemented: DYN_GROUPPRIVATE clause is not implemented yet +subroutine f00(n) + implicit none + integer :: n + !$omp target dyn_groupprivate(n) + !$omp end target +end + diff --git a/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 b/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 new file mode 100644 index 000000000000..7d41efd348e5 --- /dev/null +++ b/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 @@ -0,0 +1,70 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=61 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=61 %s | FileCheck 
--check-prefix="PARSE-TREE" %s + +subroutine f00(n) + implicit none + integer :: n + !$omp target dyn_groupprivate(n) + !$omp end target +end + +!UNPARSE: SUBROUTINE f00 (n) +!UNPARSE: IMPLICIT NONE +!UNPARSE: INTEGER n +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(n) +!UNPARSE: !$OMP END TARGET +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginDirective +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target +!PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause +!PARSE-TREE: | | Scalar -> Integer -> Expr = 'n' +!PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' +!PARSE-TREE: | Flags = None + + +subroutine f01(n) + implicit none + integer :: n + !$omp target dyn_groupprivate(strict: n) + !$omp end target +end + +!UNPARSE: SUBROUTINE f01 (n) +!UNPARSE: IMPLICIT NONE +!UNPARSE: INTEGER n +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(STRICT: n) +!UNPARSE: !$OMP END TARGET +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginDirective +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target +!PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause +!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Strict +!PARSE-TREE: | | Scalar -> Integer -> Expr = 'n' +!PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' +!PARSE-TREE: | Flags = None + + +subroutine f02(n) + implicit none + integer :: n + !$omp target dyn_groupprivate(fallback, cgroup: n) + !$omp end target +end + +!UNPARSE: SUBROUTINE f02 (n) +!UNPARSE: IMPLICIT NONE +!UNPARSE: INTEGER n +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK, CGROUP: n) +!UNPARSE: !$OMP END TARGET +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginDirective +!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target +!PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause +!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Fallback +!PARSE-TREE: | | Modifier -> OmpAccessGroup -> Value = Cgroup 
+!PARSE-TREE: | | Scalar -> Integer -> Expr = 'n' +!PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' +!PARSE-TREE: | Flags = None diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index ce1cedc188fb..8ea50e7e8d41 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -242,7 +242,7 @@ ENUM(MotionExpectation, Present); // V5.2: [15.9.1] `task-dependence-type` modifier ENUM(DependenceType, Depobj, In, Inout, Inoutset, Mutexinoutset, Out, Sink, Source); -ENUM(Prescriptiveness, Strict); +ENUM(Prescriptiveness, Strict, Fallback); template // struct LoopIterationT { @@ -574,6 +574,15 @@ struct DynamicAllocatorsT { using EmptyTrait = std::true_type; }; +template // +struct DynGroupprivateT { + ENUM(AccessGroup, Cgroup); + using Prescriptiveness = type::Prescriptiveness; + using Size = E; + using TupleTrait = std::true_type; + std::tuple t; +}; + // V5.2: [5.8.4] `enter` clause template // struct EnterT { @@ -1263,11 +1272,12 @@ template using TupleClausesT = std::variant, AlignedT, AllocateT, DefaultmapT, DeviceT, DistScheduleT, - DoacrossT, FromT, GrainsizeT, - IfT, InitT, InReductionT, - LastprivateT, LinearT, MapT, - NumTasksT, OrderT, ReductionT, - ScheduleT, TaskReductionT, ToT>; + DoacrossT, DynGroupprivateT, FromT, + GrainsizeT, IfT, InitT, + InReductionT, LastprivateT, LinearT, + MapT, NumTasksT, OrderT, + ReductionT, ScheduleT, + TaskReductionT, ToT>; template using UnionClausesT = std::variant>; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 79f25bb05f20..7140980e6353 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -178,6 +178,9 @@ def OMPC_Doacross : Clause<[Spelling<"doacross">]> { def OMPC_DynamicAllocators : Clause<[Spelling<"dynamic_allocators">]> { let clangClass = "OMPDynamicAllocatorsClause"; } +def OMPC_DynGroupprivate : 
Clause<[Spelling<"dyn_groupprivate">]> { + let flangClass = "OmpDynGroupprivateClause"; +} def OMPC_Enter : Clause<[Spelling<"enter">]> { let flangClass = "OmpEnterClause"; } @@ -1104,6 +1107,7 @@ def OMP_Target : Directive<[Spelling<"target">]> { let allowedOnceClauses = [ VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1254,6 +1258,7 @@ def OMP_Teams : Directive<[Spelling<"teams">]> { ]; let allowedOnceClauses = [ VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1522,6 +1527,7 @@ def OMP_target_loop : Directive<[Spelling<"target loop">]> { let allowedOnceClauses = [ VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -1983,6 +1989,7 @@ def OMP_TargetParallel : Directive<[Spelling<"target parallel">]> { let allowedOnceClauses = [ VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2012,6 +2019,7 @@ def OMP_TargetParallelDo : Directive<[Spelling<"target parallel do">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2054,6 +2062,9 @@ def OMP_TargetParallelDoSimd VersionedClause, VersionedClause, ]; + let allowedOnceClauses = [ + VersionedClause, + ]; let leafConstructs = [OMP_Target, OMP_Parallel, OMP_Do, OMP_Simd]; let category = CA_Executable; let languages = [L_Fortran]; @@ -2086,6 +2097,7 @@ def OMP_TargetParallelFor : Directive<[Spelling<"target parallel for">]> { VersionedClause, ]; let allowedOnceClauses = [ + VersionedClause, VersionedClause, VersionedClause, ]; @@ -2126,6 +2138,7 @@ def OMP_TargetParallelForSimd VersionedClause, ]; let allowedOnceClauses = [ + VersionedClause, VersionedClause, VersionedClause, ]; @@ -2155,6 +2168,7 @@ def OMP_target_parallel_loop : Directive<[Spelling<"target parallel loop">]> { VersionedClause, VersionedClause, VersionedClause, + 
VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2189,6 +2203,7 @@ def OMP_TargetSimd : Directive<[Spelling<"target simd">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2220,6 +2235,7 @@ def OMP_TargetTeams : Directive<[Spelling<"target teams">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2252,6 +2268,7 @@ def OMP_TargetTeamsDistribute VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2284,6 +2301,7 @@ def OMP_TargetTeamsDistributeParallelDo VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2322,6 +2340,7 @@ def OMP_TargetTeamsDistributeParallelDoSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2367,6 +2386,7 @@ def OMP_TargetTeamsDistributeParallelFor VersionedClause, ]; let allowedOnceClauses = [ + VersionedClause, VersionedClause, ]; let leafConstructs = @@ -2409,6 +2429,7 @@ def OMP_TargetTeamsDistributeParallelForSimd VersionedClause, ]; let allowedOnceClauses = [ + VersionedClause, VersionedClause, ]; let leafConstructs = @@ -2441,6 +2462,7 @@ def OMP_TargetTeamsDistributeSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2474,6 +2496,7 @@ def OMP_target_teams_loop : Directive<[Spelling<"target teams loop">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2532,6 +2555,7 @@ def OMP_TeamsDistribute : Directive<[Spelling<"teams distribute">]> { VersionedClause, ]; let allowedOnceClauses = [ + VersionedClause, VersionedClause, VersionedClause, ]; @@ -2555,6 +2579,7 @@ def 
OMP_TeamsDistributeParallelDo VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2584,6 +2609,7 @@ def OMP_TeamsDistributeParallelDoSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2620,6 +2646,9 @@ def OMP_TeamsDistributeParallelFor VersionedClause, VersionedClause, ]; + let allowedOnceClauses = [ + VersionedClause, + ]; let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For]; let category = CA_Executable; let languages = [L_C]; @@ -2650,6 +2679,9 @@ def OMP_TeamsDistributeParallelForSimd VersionedClause, VersionedClause, ]; + let allowedOnceClauses = [ + VersionedClause, + ]; let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd]; let category = CA_Executable; @@ -2673,6 +2705,7 @@ def OMP_TeamsDistributeSimd : Directive<[Spelling<"teams distribute simd">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2696,6 +2729,7 @@ def OMP_teams_loop : Directive<[Spelling<"teams loop">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, From 43df97a909fbb0ebc8416b9faa88de21447fc3fe Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Mon, 18 Aug 2025 11:55:23 -0700 Subject: [PATCH 19/27] llvm-profgen: Avoid "using namespace" in headers (#147631) Avoid global `using namespace` directives in headers as they are bad style. 
--- llvm/tools/llvm-profgen/CSPreInliner.h | 3 -- llvm/tools/llvm-profgen/ErrorHandling.h | 4 ++- llvm/tools/llvm-profgen/PerfReader.cpp | 5 ++- llvm/tools/llvm-profgen/PerfReader.h | 3 -- llvm/tools/llvm-profgen/ProfileGenerator.cpp | 6 ++-- llvm/tools/llvm-profgen/ProfileGenerator.h | 3 -- llvm/tools/llvm-profgen/ProfiledBinary.cpp | 1 + llvm/tools/llvm-profgen/ProfiledBinary.h | 34 +++++++++----------- llvm/tools/llvm-profgen/llvm-profgen.cpp | 6 ++-- 9 files changed, 27 insertions(+), 38 deletions(-) diff --git a/llvm/tools/llvm-profgen/CSPreInliner.h b/llvm/tools/llvm-profgen/CSPreInliner.h index 8a3f16a4f13c..022c3f8d0dae 100644 --- a/llvm/tools/llvm-profgen/CSPreInliner.h +++ b/llvm/tools/llvm-profgen/CSPreInliner.h @@ -16,9 +16,6 @@ #include "llvm/Transforms/IPO/ProfiledCallGraph.h" #include "llvm/Transforms/IPO/SampleContextTracker.h" -using namespace llvm; -using namespace sampleprof; - namespace llvm { namespace sampleprof { diff --git a/llvm/tools/llvm-profgen/ErrorHandling.h b/llvm/tools/llvm-profgen/ErrorHandling.h index b797add8a892..17084bd785e6 100644 --- a/llvm/tools/llvm-profgen/ErrorHandling.h +++ b/llvm/tools/llvm-profgen/ErrorHandling.h @@ -16,7 +16,7 @@ #include "llvm/Support/WithColor.h" #include -using namespace llvm; +namespace llvm { [[noreturn]] inline void exitWithError(const Twine &Message, StringRef Whence = StringRef(), @@ -53,4 +53,6 @@ inline void emitWarningSummary(uint64_t Num, uint64_t Total, StringRef Msg) { << "%(" << Num << "/" << Total << ") " << Msg << "\n"; } +} // end namespace llvm + #endif diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp index ad113eda2791..4ab5f2e63fd1 100644 --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -15,6 +15,8 @@ #define DEBUG_TYPE "perf-reader" +using namespace llvm; + cl::opt SkipSymbolization("skip-symbolization", cl::desc("Dump the unsymbolized profile to the " "output file. 
It will show unwinder " @@ -47,9 +49,6 @@ static cl::opt CSProfMaxUnsymbolizedCtxDepth( cl::desc("Keep the last K contexts while merging unsymbolized profile. -1 " "means no depth limit.")); -extern cl::opt PerfTraceFilename; -extern cl::opt ShowDisassemblyOnly; -extern cl::opt ShowSourceLocations; extern cl::opt OutputFilename; namespace llvm { diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h index 4b3ac8f56975..19451915812e 100644 --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -17,9 +17,6 @@ #include #include -using namespace llvm; -using namespace sampleprof; - namespace llvm { class CleanupInstaller; diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index db686c3b597e..9468228acc42 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -17,6 +17,9 @@ #include #include +using namespace llvm; +using namespace sampleprof; + cl::opt OutputFilename("output", cl::value_desc("output"), cl::Required, cl::desc("Output profile file")); @@ -104,9 +107,6 @@ cl::opt InferMissingFrames( "Infer missing call frames due to compiler tail call elimination."), llvm::cl::Optional); -using namespace llvm; -using namespace sampleprof; - namespace llvm { namespace sampleprof { diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h index 5e36128530cd..d3e04563a81c 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.h +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -17,9 +17,6 @@ #include #include -using namespace llvm; -using namespace sampleprof; - namespace llvm { namespace sampleprof { diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp index 6847ba1b21b1..beef4338d5f8 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -25,6 +25,7 @@ 
#define DEBUG_TYPE "load-binary" using namespace llvm; +using namespace llvm::object; using namespace sampleprof; cl::opt ShowDisassemblyOnly("show-disassembly-only", diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h index 0588cb48b2af..5b35c040b2c4 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -42,15 +42,10 @@ #include namespace llvm { + extern cl::opt EnableCSPreInliner; extern cl::opt UseContextCostForPreInliner; -} // namespace llvm -using namespace llvm; -using namespace sampleprof; -using namespace llvm::object; - -namespace llvm { namespace sampleprof { class ProfiledBinary; @@ -303,34 +298,34 @@ class ProfiledBinary { bool IsCOFF = false; - void setPreferredTextSegmentAddresses(const ObjectFile *O); + void setPreferredTextSegmentAddresses(const object::ObjectFile *O); template - void setPreferredTextSegmentAddresses(const ELFFile &Obj, + void setPreferredTextSegmentAddresses(const object::ELFFile &Obj, StringRef FileName); - void setPreferredTextSegmentAddresses(const COFFObjectFile *Obj, + void setPreferredTextSegmentAddresses(const object::COFFObjectFile *Obj, StringRef FileName); - void checkPseudoProbe(const ELFObjectFileBase *Obj); + void checkPseudoProbe(const object::ELFObjectFileBase *Obj); - void decodePseudoProbe(const ELFObjectFileBase *Obj); + void decodePseudoProbe(const object::ELFObjectFileBase *Obj); - void - checkUseFSDiscriminator(const ObjectFile *Obj, - std::map &AllSymbols); + void checkUseFSDiscriminator( + const object::ObjectFile *Obj, + std::map &AllSymbols); // Set up disassembler and related components. - void setUpDisassembler(const ObjectFile *Obj); + void setUpDisassembler(const object::ObjectFile *Obj); symbolize::LLVMSymbolizer::Options getSymbolizerOpts() const; // Load debug info of subprograms from DWARF section. 
- void loadSymbolsFromDWARF(ObjectFile &Obj); + void loadSymbolsFromDWARF(object::ObjectFile &Obj); // Load debug info from DWARF unit. void loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit); // Create elf symbol to its start address mapping. - void populateElfSymbolAddressList(const ELFObjectFileBase *O); + void populateElfSymbolAddressList(const object::ELFObjectFileBase *O); // A function may be spilt into multiple non-continuous address ranges. We use // this to set whether start a function range is the real entry of the @@ -341,11 +336,12 @@ class ProfiledBinary { void warnNoFuncEntry(); /// Dissassemble the text section and build various address maps. - void disassemble(const ObjectFile *O); + void disassemble(const object::ObjectFile *O); /// Helper function to dissassemble the symbol and extract info for unwinding bool dissassembleSymbol(std::size_t SI, ArrayRef Bytes, - SectionSymbolsTy &Symbols, const SectionRef &Section); + SectionSymbolsTy &Symbols, + const object::SectionRef &Section); /// Symbolize a given instruction pointer and return a full call context. SampleContextFrameVector symbolize(const InstructionPointer &IP, bool UseCanonicalFnName = false, diff --git a/llvm/tools/llvm-profgen/llvm-profgen.cpp b/llvm/tools/llvm-profgen/llvm-profgen.cpp index 3b974e25103a..5464888e77ad 100644 --- a/llvm/tools/llvm-profgen/llvm-profgen.cpp +++ b/llvm/tools/llvm-profgen/llvm-profgen.cpp @@ -21,6 +21,9 @@ #include "llvm/Support/TargetSelect.h" #include "llvm/Support/VirtualFileSystem.h" +using namespace llvm; +using namespace sampleprof; + static cl::OptionCategory ProfGenCategory("ProfGen Options"); static cl::opt PerfScriptFilename( @@ -71,9 +74,6 @@ extern cl::opt ShowDisassemblyOnly; extern cl::opt ShowSourceLocations; extern cl::opt SkipSymbolization; -using namespace llvm; -using namespace sampleprof; - // Validate the command line input. static void validateCommandLine() { // Allow the missing perfscript if we only use to show binary disassembly. 
From 549d7c4f35a99598a269004ee13b237d2565b5ec Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Mon, 18 Aug 2025 13:56:24 -0500 Subject: [PATCH 20/27] [SPARC] Change `half` to use soft promotion rather than `PromoteFloat` (#152727) `half` currently uses the default legalization of promoting to `f32`; however, this legalization performs math in a way that results in incorrect rounding. Switch to the soft promote implementation, which does not have this problem. The SPARC ABI does not specify a `_Float16` type, so there is no concern with keeping interface compatibility. Fixes the SPARC part of https://github.com/llvm/llvm-project/issues/97975 Fixes the SPARC part of https://github.com/llvm/llvm-project/issues/97981 --- llvm/lib/Target/Sparc/SparcISelLowering.h | 2 + llvm/test/CodeGen/Generic/half.ll | 4 +- llvm/test/CodeGen/SPARC/fp16-promote.ll | 76 ++--- llvm/test/CodeGen/SPARC/half.ll | 235 ++++----------- llvm/test/CodeGen/SPARC/llvm.sincos.ll | 339 ++++++++++++---------- 5 files changed, 297 insertions(+), 359 deletions(-) diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index 4017beb88ff3..7fffb7c9823f 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -28,6 +28,8 @@ namespace llvm { bool useSoftFloat() const override; + bool softPromoteHalfType() const override { return true; } + /// computeKnownBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets.
diff --git a/llvm/test/CodeGen/Generic/half.ll b/llvm/test/CodeGen/Generic/half.ll index 9d6c8eb2730d..ef7bfe2f2d9c 100644 --- a/llvm/test/CodeGen/Generic/half.ll +++ b/llvm/test/CodeGen/Generic/half.ll @@ -34,8 +34,8 @@ ; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD %} ; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} ; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} -; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD %} -; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} ; RUN: %if spirv-registered-target %{ llc %s -o - -mtriple=spirv-unknown-unknown | FileCheck %s --check-prefixes=NOCRASH %} ; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} ; RUN: %if ve-registered-target %{ llc %s -o - -mtriple=ve-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD %} diff --git a/llvm/test/CodeGen/SPARC/fp16-promote.ll b/llvm/test/CodeGen/SPARC/fp16-promote.ll index efe67b04e8fb..64873b744de5 100644 --- a/llvm/test/CodeGen/SPARC/fp16-promote.ll +++ b/llvm/test/CodeGen/SPARC/fp16-promote.ll @@ -329,13 +329,14 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V8-OPT-LABEL: test_fadd: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -104, %sp -; V8-OPT-NEXT: call __extendhfsf2 -; V8-OPT-NEXT: lduh [%i0], %o0 -; V8-OPT-NEXT: st %f0, [%fp+-8] ! 
4-byte Folded Spill +; V8-OPT-NEXT: lduh [%i0], %i2 ; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i1], %o0 +; V8-OPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V8-OPT-NEXT: call __extendhfsf2 +; V8-OPT-NEXT: mov %i2, %o0 ; V8-OPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload -; V8-OPT-NEXT: fadds %f1, %f0, %f0 +; V8-OPT-NEXT: fadds %f0, %f1, %f0 ; V8-OPT-NEXT: st %f0, [%fp+-4] ; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: ld [%fp+-4], %o0 @@ -346,13 +347,14 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V8-UNOPT-LABEL: test_fadd: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -104, %sp -; V8-UNOPT-NEXT: call __extendhfsf2 -; V8-UNOPT-NEXT: lduh [%i0], %o0 -; V8-UNOPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V8-UNOPT-NEXT: lduh [%i0], %i2 +; V8-UNOPT-NEXT: st %i2, [%fp+-12] ! 4-byte Folded Spill ; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i1], %o0 -; V8-UNOPT-NEXT: fmovs %f0, %f1 -; V8-UNOPT-NEXT: ld [%fp+-8], %f0 ! 4-byte Folded Reload +; V8-UNOPT-NEXT: ld [%fp+-12], %o0 ! 4-byte Folded Reload +; V8-UNOPT-NEXT: call __extendhfsf2 +; V8-UNOPT-NEXT: st %f0, [%fp+-8] +; V8-UNOPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V8-UNOPT-NEXT: fadds %f0, %f1, %f0 ; V8-UNOPT-NEXT: st %f0, [%fp+-4] ; V8-UNOPT-NEXT: call __truncsfhf2 @@ -364,13 +366,14 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V9-LABEL: test_fadd: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -104, %sp -; V9-NEXT: call __extendhfsf2 -; V9-NEXT: lduh [%i0], %o0 -; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V9-NEXT: lduh [%i0], %i2 ; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i1], %o0 +; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V9-NEXT: call __extendhfsf2 +; V9-NEXT: mov %i2, %o0 ; V9-NEXT: ld [%fp+-8], %f1 ! 
4-byte Folded Reload -; V9-NEXT: fadds %f1, %f0, %f0 +; V9-NEXT: fadds %f0, %f1, %f0 ; V9-NEXT: st %f0, [%fp+-4] ; V9-NEXT: call __truncsfhf2 ; V9-NEXT: ld [%fp+-4], %o0 @@ -381,14 +384,15 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; SPARC64-LABEL: test_fadd: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: lduh [%i0], %o0 -; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill +; SPARC64-NEXT: lduh [%i0], %i2 ; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i1], %o0 +; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill +; SPARC64-NEXT: call __extendhfsf2 +; SPARC64-NEXT: mov %i2, %o0 ; SPARC64-NEXT: ld [%fp+2043], %f1 ! 4-byte Folded Reload ; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: fadds %f1, %f0, %f1 +; SPARC64-NEXT: fadds %f0, %f1, %f1 ; SPARC64-NEXT: sth %o0, [%i0] ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore @@ -403,13 +407,14 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V8-OPT-LABEL: test_fmul: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -104, %sp -; V8-OPT-NEXT: call __extendhfsf2 -; V8-OPT-NEXT: lduh [%i0], %o0 -; V8-OPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V8-OPT-NEXT: lduh [%i0], %i2 ; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i1], %o0 +; V8-OPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V8-OPT-NEXT: call __extendhfsf2 +; V8-OPT-NEXT: mov %i2, %o0 ; V8-OPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload -; V8-OPT-NEXT: fmuls %f1, %f0, %f0 +; V8-OPT-NEXT: fmuls %f0, %f1, %f0 ; V8-OPT-NEXT: st %f0, [%fp+-4] ; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: ld [%fp+-4], %o0 @@ -420,13 +425,14 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V8-UNOPT-LABEL: test_fmul: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -104, %sp -; V8-UNOPT-NEXT: call __extendhfsf2 -; V8-UNOPT-NEXT: lduh [%i0], %o0 -; V8-UNOPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V8-UNOPT-NEXT: lduh [%i0], %i2 +; V8-UNOPT-NEXT: st %i2, [%fp+-12] ! 
4-byte Folded Spill ; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i1], %o0 -; V8-UNOPT-NEXT: fmovs %f0, %f1 -; V8-UNOPT-NEXT: ld [%fp+-8], %f0 ! 4-byte Folded Reload +; V8-UNOPT-NEXT: ld [%fp+-12], %o0 ! 4-byte Folded Reload +; V8-UNOPT-NEXT: call __extendhfsf2 +; V8-UNOPT-NEXT: st %f0, [%fp+-8] +; V8-UNOPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V8-UNOPT-NEXT: fmuls %f0, %f1, %f0 ; V8-UNOPT-NEXT: st %f0, [%fp+-4] ; V8-UNOPT-NEXT: call __truncsfhf2 @@ -438,13 +444,14 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V9-LABEL: test_fmul: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -104, %sp -; V9-NEXT: call __extendhfsf2 -; V9-NEXT: lduh [%i0], %o0 -; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V9-NEXT: lduh [%i0], %i2 ; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i1], %o0 +; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; V9-NEXT: call __extendhfsf2 +; V9-NEXT: mov %i2, %o0 ; V9-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload -; V9-NEXT: fmuls %f1, %f0, %f0 +; V9-NEXT: fmuls %f0, %f1, %f0 ; V9-NEXT: st %f0, [%fp+-4] ; V9-NEXT: call __truncsfhf2 ; V9-NEXT: ld [%fp+-4], %o0 @@ -455,14 +462,15 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; SPARC64-LABEL: test_fmul: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: lduh [%i0], %o0 -; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill +; SPARC64-NEXT: lduh [%i0], %i2 ; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i1], %o0 +; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill +; SPARC64-NEXT: call __extendhfsf2 +; SPARC64-NEXT: mov %i2, %o0 ; SPARC64-NEXT: ld [%fp+2043], %f1 ! 
4-byte Folded Reload ; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: fmuls %f1, %f0, %f1 +; SPARC64-NEXT: fmuls %f0, %f1, %f1 ; SPARC64-NEXT: sth %o0, [%i0] ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore diff --git a/llvm/test/CodeGen/SPARC/half.ll b/llvm/test/CodeGen/SPARC/half.ll index 34e2ceee28fc..565160149e71 100644 --- a/llvm/test/CodeGen/SPARC/half.ll +++ b/llvm/test/CodeGen/SPARC/half.ll @@ -9,43 +9,19 @@ ; copied from test/CodeGen/X86/half.ll. define void @store(half %x, ptr %p) nounwind { -; SPARC32-LABEL: store: -; SPARC32: ! %bb.0: -; SPARC32-NEXT: save %sp, -96, %sp -; SPARC32-NEXT: call __truncsfhf2 -; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: sth %o0, [%i1] -; SPARC32-NEXT: ret -; SPARC32-NEXT: restore -; -; SPARC64-LABEL: store: -; SPARC64: ! %bb.0: -; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: nop -; SPARC64-NEXT: sth %o0, [%i1] -; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; CHECK-LABEL: store: +; CHECK: ! %bb.0: +; CHECK-NEXT: retl +; CHECK-NEXT: sth %o0, [%o1] store half %x, ptr %p ret void } define half @return(ptr %p) nounwind { -; SPARC32-LABEL: return: -; SPARC32: ! %bb.0: -; SPARC32-NEXT: save %sp, -96, %sp -; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: lduh [%i0], %o0 -; SPARC32-NEXT: ret -; SPARC32-NEXT: restore -; -; SPARC64-LABEL: return: -; SPARC64: ! %bb.0: -; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: lduh [%i0], %o0 -; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; CHECK-LABEL: return: +; CHECK: ! %bb.0: +; CHECK-NEXT: retl +; CHECK-NEXT: lduh [%o0], %o0 %r = load half, ptr %p ret half %r } @@ -185,46 +161,19 @@ define void @test_bitcast_to_half(ptr %addr, i16 %in) nounwind { } define half @from_bits(i16 %x) nounwind { -; SPARC32-LABEL: from_bits: -; SPARC32: ! 
%bb.0: -; SPARC32-NEXT: save %sp, -96, %sp -; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: ret -; SPARC32-NEXT: restore -; -; SPARC64-LABEL: from_bits: -; SPARC64: ! %bb.0: -; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: srl %i0, 0, %o0 -; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; CHECK-LABEL: from_bits: +; CHECK: ! %bb.0: +; CHECK-NEXT: retl +; CHECK-NEXT: nop %res = bitcast i16 %x to half ret half %res } define i16 @to_bits(half %x) nounwind { -; SPARC32-LABEL: to_bits: -; SPARC32: ! %bb.0: -; SPARC32-NEXT: save %sp, -96, %sp -; SPARC32-NEXT: call __truncsfhf2 -; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: sethi 4194240, %i0 -; SPARC32-NEXT: andn %o0, %i0, %i0 -; SPARC32-NEXT: ret -; SPARC32-NEXT: restore -; -; SPARC64-LABEL: to_bits: -; SPARC64: ! %bb.0: -; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: nop -; SPARC64-NEXT: sethi 63, %i0 -; SPARC64-NEXT: or %i0, 1023, %i0 -; SPARC64-NEXT: and %o0, %i0, %i0 -; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; CHECK-LABEL: to_bits: +; CHECK: ! %bb.0: +; CHECK-NEXT: retl +; CHECK-NEXT: nop %res = bitcast half %x to i16 ret i16 %res } @@ -694,37 +643,47 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) nounwind { define float @test_sitofp_fadd_i32(i32 %a, ptr %b) nounwind { ; SPARC32-LABEL: test_sitofp_fadd_i32: ; SPARC32: ! %bb.0: -; SPARC32-NEXT: save %sp, -104, %sp -; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: lduh [%i1], %o0 +; SPARC32-NEXT: save %sp, -112, %sp +; SPARC32-NEXT: lduh [%i1], %i1 ; SPARC32-NEXT: st %i0, [%fp+-4] -; SPARC32-NEXT: ld [%fp+-4], %f1 -; SPARC32-NEXT: st %f0, [%fp+-12] ! 
4-byte Folded Spill -; SPARC32-NEXT: fitos %f1, %f0 +; SPARC32-NEXT: ld [%fp+-4], %f0 +; SPARC32-NEXT: fitos %f0, %f0 ; SPARC32-NEXT: st %f0, [%fp+-8] ; SPARC32-NEXT: call __truncsfhf2 ; SPARC32-NEXT: ld [%fp+-8], %o0 ; SPARC32-NEXT: call __extendhfsf2 ; SPARC32-NEXT: nop -; SPARC32-NEXT: ld [%fp+-12], %f1 ! 4-byte Folded Reload -; SPARC32-NEXT: fadds %f1, %f0, %f0 +; SPARC32-NEXT: st %f0, [%fp+-16] ! 4-byte Folded Spill +; SPARC32-NEXT: call __extendhfsf2 +; SPARC32-NEXT: mov %i1, %o0 +; SPARC32-NEXT: ld [%fp+-16], %f1 ! 4-byte Folded Reload +; SPARC32-NEXT: fadds %f0, %f1, %f0 +; SPARC32-NEXT: st %f0, [%fp+-12] +; SPARC32-NEXT: call __truncsfhf2 +; SPARC32-NEXT: ld [%fp+-12], %o0 +; SPARC32-NEXT: call __extendhfsf2 +; SPARC32-NEXT: nop ; SPARC32-NEXT: ret ; SPARC32-NEXT: restore ; ; SPARC64-LABEL: test_sitofp_fadd_i32: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: lduh [%i1], %o0 -; SPARC64-NEXT: st %f0, [%fp+2039] ! 4-byte Folded Spill +; SPARC64-NEXT: lduh [%i1], %i1 ; SPARC64-NEXT: st %i0, [%fp+2043] ; SPARC64-NEXT: ld [%fp+2043], %f0 ; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: fitos %f0, %f1 ; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: nop +; SPARC64-NEXT: st %f0, [%fp+2039] ! 4-byte Folded Spill +; SPARC64-NEXT: call __extendhfsf2 +; SPARC64-NEXT: mov %i1, %o0 ; SPARC64-NEXT: ld [%fp+2039], %f1 ! 4-byte Folded Reload -; SPARC64-NEXT: fadds %f1, %f0, %f0 +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: fadds %f0, %f1, %f1 +; SPARC64-NEXT: call __extendhfsf2 +; SPARC64-NEXT: nop ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore %tmp0 = load half, ptr %b @@ -738,10 +697,8 @@ define half @PR40273(half) nounwind { ; V8-LABEL: PR40273: ; V8: ! 
%bb.0: ; V8-NEXT: save %sp, -96, %sp -; V8-NEXT: call __truncsfhf2 -; V8-NEXT: mov %i0, %o0 ; V8-NEXT: call __extendhfsf2 -; V8-NEXT: nop +; V8-NEXT: mov %i0, %o0 ; V8-NEXT: sethi %hi(.LCPI24_0), %i0 ; V8-NEXT: ld [%i0+%lo(.LCPI24_0)], %f1 ; V8-NEXT: fcmps %f0, %f1 @@ -749,54 +706,40 @@ define half @PR40273(half) nounwind { ; V8-NEXT: fbne .LBB24_2 ; V8-NEXT: nop ; V8-NEXT: ! %bb.1: -; V8-NEXT: ba .LBB24_3 -; V8-NEXT: mov %g0, %i0 +; V8-NEXT: ret +; V8-NEXT: restore %g0, %g0, %o0 ; V8-NEXT: .LBB24_2: -; V8-NEXT: mov 4, %i0 -; V8-NEXT: .LBB24_3: -; V8-NEXT: sethi %hi(.LCPI24_1), %i1 -; V8-NEXT: add %i1, %lo(.LCPI24_1), %i1 -; V8-NEXT: ld [%i1+%i0], %f0 +; V8-NEXT: sethi 15, %i0 ; V8-NEXT: ret ; V8-NEXT: restore ; ; V9-LABEL: PR40273: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __truncsfhf2 -; V9-NEXT: mov %i0, %o0 ; V9-NEXT: call __extendhfsf2 -; V9-NEXT: nop +; V9-NEXT: mov %i0, %o0 ; V9-NEXT: sethi %hi(.LCPI24_0), %i0 ; V9-NEXT: ld [%i0+%lo(.LCPI24_0)], %f1 ; V9-NEXT: mov %g0, %i0 +; V9-NEXT: sethi 15, %i1 ; V9-NEXT: fcmps %fcc0, %f0, %f1 -; V9-NEXT: movne %fcc0, 4, %i0 -; V9-NEXT: sethi %hi(.LCPI24_1), %i1 -; V9-NEXT: add %i1, %lo(.LCPI24_1), %i1 -; V9-NEXT: ld [%i1+%i0], %f0 +; V9-NEXT: movne %fcc0, %i1, %i0 ; V9-NEXT: ret ; V9-NEXT: restore ; ; SPARC64-LABEL: PR40273: ; SPARC64: ! 
%bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: nop ; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: nop +; SPARC64-NEXT: srl %i0, 0, %o0 ; SPARC64-NEXT: sethi %h44(.LCPI24_0), %i0 ; SPARC64-NEXT: add %i0, %m44(.LCPI24_0), %i0 ; SPARC64-NEXT: sllx %i0, 12, %i0 ; SPARC64-NEXT: ld [%i0+%l44(.LCPI24_0)], %f1 ; SPARC64-NEXT: mov %g0, %i0 +; SPARC64-NEXT: sethi 15, %i1 ; SPARC64-NEXT: fcmps %fcc0, %f0, %f1 -; SPARC64-NEXT: movne %fcc0, 4, %i0 -; SPARC64-NEXT: sethi %h44(.LCPI24_1), %i1 -; SPARC64-NEXT: add %i1, %m44(.LCPI24_1), %i1 -; SPARC64-NEXT: sllx %i1, 12, %i1 -; SPARC64-NEXT: add %i1, %l44(.LCPI24_1), %i1 -; SPARC64-NEXT: ld [%i1+%i0], %f0 +; SPARC64-NEXT: movne %fcc0, %i1, %i0 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore %2 = fcmp une half %0, 0xH0000 @@ -807,82 +750,28 @@ define half @PR40273(half) nounwind { define half @fabs(half %x) nounwind { ; SPARC32-LABEL: fabs: ; SPARC32: ! %bb.0: -; SPARC32-NEXT: save %sp, -96, %sp -; SPARC32-NEXT: call __truncsfhf2 -; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: nop -; SPARC32-NEXT: fabss %f0, %f0 -; SPARC32-NEXT: ret -; SPARC32-NEXT: restore +; SPARC32-NEXT: sethi 4194272, %o1 +; SPARC32-NEXT: retl +; SPARC32-NEXT: andn %o0, %o1, %o0 ; ; SPARC64-LABEL: fabs: ; SPARC64: ! %bb.0: -; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: nop -; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: nop -; SPARC64-NEXT: fabss %f0, %f0 -; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; SPARC64-NEXT: sethi 31, %o1 +; SPARC64-NEXT: or %o1, 1023, %o1 +; SPARC64-NEXT: retl +; SPARC64-NEXT: and %o0, %o1, %o0 %a = call half @llvm.fabs.f16(half %x) ret half %a } define half @fcopysign(half %x, half %y) nounwind { -; V8-LABEL: fcopysign: -; V8: ! 
%bb.0: -; V8-NEXT: save %sp, -96, %sp -; V8-NEXT: call __truncsfhf2 -; V8-NEXT: mov %i0, %o0 -; V8-NEXT: call __extendhfsf2 -; V8-NEXT: nop -; V8-NEXT: sethi 2097152, %i0 -; V8-NEXT: and %i1, %i0, %i0 -; V8-NEXT: cmp %i0, 0 -; V8-NEXT: be .LBB26_2 -; V8-NEXT: fabss %f0, %f0 -; V8-NEXT: ! %bb.1: -; V8-NEXT: fnegs %f0, %f0 -; V8-NEXT: .LBB26_2: -; V8-NEXT: ret -; V8-NEXT: restore -; -; V9-LABEL: fcopysign: -; V9: ! %bb.0: -; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __truncsfhf2 -; V9-NEXT: mov %i0, %o0 -; V9-NEXT: call __extendhfsf2 -; V9-NEXT: nop -; V9-NEXT: sethi 2097152, %i0 -; V9-NEXT: and %i1, %i0, %i0 -; V9-NEXT: fabss %f0, %f0 -; V9-NEXT: fnegs %f0, %f1 -; V9-NEXT: cmp %i0, 0 -; V9-NEXT: fmovsne %icc, %f1, %f0 -; V9-NEXT: ret -; V9-NEXT: restore -; -; SPARC64-LABEL: fcopysign: -; SPARC64: ! %bb.0: -; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: st %f3, [%fp+2039] -; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: nop -; SPARC64-NEXT: ld [%fp+2039], %f1 ! 4-byte Folded Reload -; SPARC64-NEXT: st %f1, [%fp+2043] -; SPARC64-NEXT: ld [%fp+2043], %i0 -; SPARC64-NEXT: sethi 2097152, %i1 -; SPARC64-NEXT: and %i0, %i1, %i0 -; SPARC64-NEXT: fabss %f0, %f0 -; SPARC64-NEXT: fnegs %f0, %f1 -; SPARC64-NEXT: cmp %i0, 0 -; SPARC64-NEXT: fmovsne %icc, %f1, %f0 -; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; CHECK-LABEL: fcopysign: +; CHECK: ! %bb.0: +; CHECK-NEXT: sethi 4194272, %o2 +; CHECK-NEXT: and %o1, %o2, %o1 +; CHECK-NEXT: andn %o0, %o2, %o0 +; CHECK-NEXT: retl +; CHECK-NEXT: or %o0, %o1, %o0 %a = call half @llvm.copysign.f16(half %x, half %y) ret half %a } diff --git a/llvm/test/CodeGen/SPARC/llvm.sincos.ll b/llvm/test/CodeGen/SPARC/llvm.sincos.ll index 87b9c8e7ba47..8d0d50f67e3f 100644 --- a/llvm/test/CodeGen/SPARC/llvm.sincos.ll +++ b/llvm/test/CodeGen/SPARC/llvm.sincos.ll @@ -10,74 +10,84 @@ define { half, half } @test_sincos_f16(half %a) #0 { ; SPARC32-LABEL: test_sincos_f16: ; SPARC32: ! 
%bb.0: ; SPARC32-NEXT: save %sp, -104, %sp -; SPARC32-NEXT: call __truncsfhf2 -; SPARC32-NEXT: mov %i0, %o0 ; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: nop -; SPARC32-NEXT: st %f0, [%fp+-4] -; SPARC32-NEXT: ld [%fp+-4], %i0 +; SPARC32-NEXT: mov %i0, %o0 +; SPARC32-NEXT: st %f0, [%fp+-12] +; SPARC32-NEXT: ld [%fp+-12], %i0 ; SPARC32-NEXT: call sinf ; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill +; SPARC32-NEXT: st %f0, [%fp+-8] ; SPARC32-NEXT: call cosf ; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: fmovs %f0, %f1 -; SPARC32-NEXT: ld [%fp+-8], %f0 ! 4-byte Folded Reload +; SPARC32-NEXT: st %f0, [%fp+-4] +; SPARC32-NEXT: call __truncsfhf2 +; SPARC32-NEXT: ld [%fp+-8], %o0 +; SPARC32-NEXT: mov %o0, %i0 +; SPARC32-NEXT: call __truncsfhf2 +; SPARC32-NEXT: ld [%fp+-4], %o0 ; SPARC32-NEXT: ret -; SPARC32-NEXT: restore +; SPARC32-NEXT: restore %g0, %o0, %o1 ; ; SPARC64-LABEL: test_sincos_f16: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: nop ; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: nop -; SPARC64-NEXT: st %f0, [%fp+2039] ! 4-byte Folded Spill +; SPARC64-NEXT: srl %i0, 0, %o0 +; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill ; SPARC64-NEXT: fmovs %f0, %f1 ; SPARC64-NEXT: call sinf ; SPARC64-NEXT: nop -; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill -; SPARC64-NEXT: call cosf -; SPARC64-NEXT: ld [%fp+2039], %f1 ; SPARC64-NEXT: fmovs %f0, %f1 -; SPARC64-NEXT: ld [%fp+2043], %f0 ! 4-byte Folded Reload +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: nop +; SPARC64-NEXT: mov %o0, %i0 +; SPARC64-NEXT: call cosf +; SPARC64-NEXT: ld [%fp+2043], %f1 +; SPARC64-NEXT: fmovs %f0, %f1 +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: nop ; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; SPARC64-NEXT: restore %g0, %o0, %o1 ; ; GNU32-LABEL: test_sincos_f16: ; GNU32: ! 
%bb.0: -; GNU32-NEXT: save %sp, -104, %sp -; GNU32-NEXT: call __truncsfhf2 -; GNU32-NEXT: mov %i0, %o0 +; GNU32-NEXT: save %sp, -112, %sp ; GNU32-NEXT: call __extendhfsf2 -; GNU32-NEXT: nop +; GNU32-NEXT: mov %i0, %o0 ; GNU32-NEXT: st %f0, [%fp+-12] ; GNU32-NEXT: ld [%fp+-12], %o0 ; GNU32-NEXT: add %fp, -4, %o1 ; GNU32-NEXT: call sincosf ; GNU32-NEXT: add %fp, -8, %o2 ; GNU32-NEXT: ld [%fp+-4], %f0 -; GNU32-NEXT: ld [%fp+-8], %f1 +; GNU32-NEXT: st %f0, [%fp+-20] +; GNU32-NEXT: ld [%fp+-8], %f0 +; GNU32-NEXT: st %f0, [%fp+-16] +; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: ld [%fp+-20], %o0 +; GNU32-NEXT: mov %o0, %i0 +; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: ld [%fp+-16], %o0 ; GNU32-NEXT: ret -; GNU32-NEXT: restore +; GNU32-NEXT: restore %g0, %o0, %o1 ; ; GNU64-LABEL: test_sincos_f16: ; GNU64: ! %bb.0: ; GNU64-NEXT: save %sp, -192, %sp -; GNU64-NEXT: call __truncsfhf2 -; GNU64-NEXT: nop ; GNU64-NEXT: call __extendhfsf2 -; GNU64-NEXT: nop +; GNU64-NEXT: srl %i0, 0, %o0 ; GNU64-NEXT: add %fp, 2043, %o1 ; GNU64-NEXT: add %fp, 2039, %o2 ; GNU64-NEXT: fmovs %f0, %f1 ; GNU64-NEXT: call sincosf ; GNU64-NEXT: nop -; GNU64-NEXT: ld [%fp+2043], %f0 +; GNU64-NEXT: call __truncsfhf2 +; GNU64-NEXT: ld [%fp+2043], %f1 +; GNU64-NEXT: mov %o0, %i0 +; GNU64-NEXT: call __truncsfhf2 ; GNU64-NEXT: ld [%fp+2039], %f1 ; GNU64-NEXT: ret -; GNU64-NEXT: restore +; GNU64-NEXT: restore %g0, %o0, %o1 %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } @@ -85,61 +95,63 @@ define { half, half } @test_sincos_f16(half %a) #0 { define half @test_sincos_f16_only_use_sin(half %a) #0 { ; SPARC32-LABEL: test_sincos_f16_only_use_sin: ; SPARC32: ! 
%bb.0: -; SPARC32-NEXT: save %sp, -96, %sp -; SPARC32-NEXT: call __truncsfhf2 -; SPARC32-NEXT: mov %i0, %o0 +; SPARC32-NEXT: save %sp, -104, %sp ; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: nop -; SPARC32-NEXT: st %f0, [%fp+-4] +; SPARC32-NEXT: mov %i0, %o0 +; SPARC32-NEXT: st %f0, [%fp+-8] ; SPARC32-NEXT: call sinf +; SPARC32-NEXT: ld [%fp+-8], %o0 +; SPARC32-NEXT: st %f0, [%fp+-4] +; SPARC32-NEXT: call __truncsfhf2 ; SPARC32-NEXT: ld [%fp+-4], %o0 ; SPARC32-NEXT: ret -; SPARC32-NEXT: restore +; SPARC32-NEXT: restore %g0, %o0, %o0 ; ; SPARC64-LABEL: test_sincos_f16_only_use_sin: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: nop ; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: nop +; SPARC64-NEXT: srl %i0, 0, %o0 ; SPARC64-NEXT: fmovs %f0, %f1 ; SPARC64-NEXT: call sinf ; SPARC64-NEXT: nop +; SPARC64-NEXT: fmovs %f0, %f1 +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: nop ; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; SPARC64-NEXT: restore %g0, %o0, %o0 ; ; GNU32-LABEL: test_sincos_f16_only_use_sin: ; GNU32: ! %bb.0: -; GNU32-NEXT: save %sp, -104, %sp -; GNU32-NEXT: call __truncsfhf2 -; GNU32-NEXT: mov %i0, %o0 +; GNU32-NEXT: save %sp, -112, %sp ; GNU32-NEXT: call __extendhfsf2 -; GNU32-NEXT: nop +; GNU32-NEXT: mov %i0, %o0 ; GNU32-NEXT: st %f0, [%fp+-12] ; GNU32-NEXT: ld [%fp+-12], %o0 ; GNU32-NEXT: add %fp, -4, %o1 ; GNU32-NEXT: call sincosf ; GNU32-NEXT: add %fp, -8, %o2 ; GNU32-NEXT: ld [%fp+-4], %f0 +; GNU32-NEXT: st %f0, [%fp+-16] +; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: ld [%fp+-16], %o0 ; GNU32-NEXT: ret -; GNU32-NEXT: restore +; GNU32-NEXT: restore %g0, %o0, %o0 ; ; GNU64-LABEL: test_sincos_f16_only_use_sin: ; GNU64: ! 
%bb.0: ; GNU64-NEXT: save %sp, -192, %sp -; GNU64-NEXT: call __truncsfhf2 -; GNU64-NEXT: nop ; GNU64-NEXT: call __extendhfsf2 -; GNU64-NEXT: nop +; GNU64-NEXT: srl %i0, 0, %o0 ; GNU64-NEXT: add %fp, 2043, %o1 ; GNU64-NEXT: add %fp, 2039, %o2 ; GNU64-NEXT: fmovs %f0, %f1 ; GNU64-NEXT: call sincosf ; GNU64-NEXT: nop -; GNU64-NEXT: ld [%fp+2043], %f0 +; GNU64-NEXT: call __truncsfhf2 +; GNU64-NEXT: ld [%fp+2043], %f1 ; GNU64-NEXT: ret -; GNU64-NEXT: restore +; GNU64-NEXT: restore %g0, %o0, %o0 %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 @@ -148,61 +160,63 @@ define half @test_sincos_f16_only_use_sin(half %a) #0 { define half @test_sincos_f16_only_use_cos(half %a) #0 { ; SPARC32-LABEL: test_sincos_f16_only_use_cos: ; SPARC32: ! %bb.0: -; SPARC32-NEXT: save %sp, -96, %sp -; SPARC32-NEXT: call __truncsfhf2 -; SPARC32-NEXT: mov %i0, %o0 +; SPARC32-NEXT: save %sp, -104, %sp ; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: nop -; SPARC32-NEXT: st %f0, [%fp+-4] +; SPARC32-NEXT: mov %i0, %o0 +; SPARC32-NEXT: st %f0, [%fp+-8] ; SPARC32-NEXT: call cosf +; SPARC32-NEXT: ld [%fp+-8], %o0 +; SPARC32-NEXT: st %f0, [%fp+-4] +; SPARC32-NEXT: call __truncsfhf2 ; SPARC32-NEXT: ld [%fp+-4], %o0 ; SPARC32-NEXT: ret -; SPARC32-NEXT: restore +; SPARC32-NEXT: restore %g0, %o0, %o0 ; ; SPARC64-LABEL: test_sincos_f16_only_use_cos: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: nop ; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: nop +; SPARC64-NEXT: srl %i0, 0, %o0 ; SPARC64-NEXT: fmovs %f0, %f1 ; SPARC64-NEXT: call cosf ; SPARC64-NEXT: nop +; SPARC64-NEXT: fmovs %f0, %f1 +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: nop ; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; SPARC64-NEXT: restore %g0, %o0, %o0 ; ; GNU32-LABEL: test_sincos_f16_only_use_cos: ; GNU32: ! 
%bb.0: -; GNU32-NEXT: save %sp, -104, %sp -; GNU32-NEXT: call __truncsfhf2 -; GNU32-NEXT: mov %i0, %o0 +; GNU32-NEXT: save %sp, -112, %sp ; GNU32-NEXT: call __extendhfsf2 -; GNU32-NEXT: nop +; GNU32-NEXT: mov %i0, %o0 ; GNU32-NEXT: st %f0, [%fp+-12] ; GNU32-NEXT: ld [%fp+-12], %o0 ; GNU32-NEXT: add %fp, -4, %o1 ; GNU32-NEXT: call sincosf ; GNU32-NEXT: add %fp, -8, %o2 ; GNU32-NEXT: ld [%fp+-8], %f0 +; GNU32-NEXT: st %f0, [%fp+-16] +; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: ld [%fp+-16], %o0 ; GNU32-NEXT: ret -; GNU32-NEXT: restore +; GNU32-NEXT: restore %g0, %o0, %o0 ; ; GNU64-LABEL: test_sincos_f16_only_use_cos: ; GNU64: ! %bb.0: ; GNU64-NEXT: save %sp, -192, %sp -; GNU64-NEXT: call __truncsfhf2 -; GNU64-NEXT: nop ; GNU64-NEXT: call __extendhfsf2 -; GNU64-NEXT: nop +; GNU64-NEXT: srl %i0, 0, %o0 ; GNU64-NEXT: add %fp, 2043, %o1 ; GNU64-NEXT: add %fp, 2039, %o2 ; GNU64-NEXT: fmovs %f0, %f1 ; GNU64-NEXT: call sincosf ; GNU64-NEXT: nop -; GNU64-NEXT: ld [%fp+2039], %f0 +; GNU64-NEXT: call __truncsfhf2 +; GNU64-NEXT: ld [%fp+2039], %f1 ; GNU64-NEXT: ret -; GNU64-NEXT: restore +; GNU64-NEXT: restore %g0, %o0, %o0 %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 @@ -211,132 +225,157 @@ define half @test_sincos_f16_only_use_cos(half %a) #0 { define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) #0 { ; SPARC32-LABEL: test_sincos_v2f16: ; SPARC32: ! %bb.0: -; SPARC32-NEXT: save %sp, -112, %sp -; SPARC32-NEXT: call __truncsfhf2 -; SPARC32-NEXT: mov %i1, %o0 +; SPARC32-NEXT: save %sp, -128, %sp ; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: nop -; SPARC32-NEXT: st %f0, [%fp+-12] ! 4-byte Folded Spill -; SPARC32-NEXT: call __truncsfhf2 -; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: call __extendhfsf2 -; SPARC32-NEXT: nop -; SPARC32-NEXT: st %f0, [%fp+-8] -; SPARC32-NEXT: ld [%fp+-12], %f0 ! 
4-byte Folded Reload -; SPARC32-NEXT: st %f0, [%fp+-4] -; SPARC32-NEXT: ld [%fp+-8], %i0 -; SPARC32-NEXT: call sinf -; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: st %f0, [%fp+-12] ! 4-byte Folded Spill -; SPARC32-NEXT: ld [%fp+-4], %i1 -; SPARC32-NEXT: call sinf ; SPARC32-NEXT: mov %i1, %o0 -; SPARC32-NEXT: st %f0, [%fp+-16] ! 4-byte Folded Spill +; SPARC32-NEXT: st %f0, [%fp+-28] +; SPARC32-NEXT: call __extendhfsf2 +; SPARC32-NEXT: mov %i0, %o0 +; SPARC32-NEXT: st %f0, [%fp+-32] +; SPARC32-NEXT: ld [%fp+-28], %i0 ; SPARC32-NEXT: call cosf ; SPARC32-NEXT: mov %i0, %o0 -; SPARC32-NEXT: st %f0, [%fp+-20] ! 4-byte Folded Spill +; SPARC32-NEXT: st %f0, [%fp+-20] +; SPARC32-NEXT: ld [%fp+-32], %i1 ; SPARC32-NEXT: call cosf ; SPARC32-NEXT: mov %i1, %o0 -; SPARC32-NEXT: fmovs %f0, %f3 -; SPARC32-NEXT: ld [%fp+-12], %f0 ! 4-byte Folded Reload -; SPARC32-NEXT: ld [%fp+-16], %f1 ! 4-byte Folded Reload -; SPARC32-NEXT: ld [%fp+-20], %f2 ! 4-byte Folded Reload +; SPARC32-NEXT: st %f0, [%fp+-12] +; SPARC32-NEXT: call sinf +; SPARC32-NEXT: mov %i0, %o0 +; SPARC32-NEXT: st %f0, [%fp+-24] +; SPARC32-NEXT: call sinf +; SPARC32-NEXT: mov %i1, %o0 +; SPARC32-NEXT: st %f0, [%fp+-16] +; SPARC32-NEXT: call __truncsfhf2 +; SPARC32-NEXT: ld [%fp+-20], %o0 +; SPARC32-NEXT: sethi 63, %i0 +; SPARC32-NEXT: or %i0, 1023, %i0 +; SPARC32-NEXT: and %o0, %i0, %i4 +; SPARC32-NEXT: call __truncsfhf2 +; SPARC32-NEXT: ld [%fp+-12], %o0 +; SPARC32-NEXT: and %o0, %i0, %i2 +; SPARC32-NEXT: call __truncsfhf2 +; SPARC32-NEXT: ld [%fp+-24], %o0 +; SPARC32-NEXT: and %o0, %i0, %i1 +; SPARC32-NEXT: call __truncsfhf2 +; SPARC32-NEXT: ld [%fp+-16], %o0 +; SPARC32-NEXT: and %o0, %i0, %g2 +; SPARC32-NEXT: mov %g2, %i0 +; SPARC32-NEXT: ! kill: def $i2 killed $i2 killed $i2_i3 ; SPARC32-NEXT: ret -; SPARC32-NEXT: restore +; SPARC32-NEXT: restore %g0, %i4, %o3 ; ; SPARC64-LABEL: test_sincos_v2f16: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: st %f1, [%fp+2039] ! 
4-byte Folded Spill -; SPARC64-NEXT: fmovs %f3, %f1 -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: nop ; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: nop +; SPARC64-NEXT: srl %i0, 0, %o0 ; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill -; SPARC64-NEXT: call __truncsfhf2 -; SPARC64-NEXT: ld [%fp+2039], %f1 -; SPARC64-NEXT: call __extendhfsf2 -; SPARC64-NEXT: nop -; SPARC64-NEXT: st %f0, [%fp+2031] ! 4-byte Folded Spill ; SPARC64-NEXT: fmovs %f0, %f1 ; SPARC64-NEXT: call sinf ; SPARC64-NEXT: nop +; SPARC64-NEXT: fmovs %f0, %f1 +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: nop +; SPARC64-NEXT: mov %o0, %i0 +; SPARC64-NEXT: call __extendhfsf2 +; SPARC64-NEXT: srl %i1, 0, %o0 ; SPARC64-NEXT: st %f0, [%fp+2039] ! 4-byte Folded Spill +; SPARC64-NEXT: fmovs %f0, %f1 ; SPARC64-NEXT: call sinf -; SPARC64-NEXT: ld [%fp+2043], %f1 -; SPARC64-NEXT: st %f0, [%fp+2035] ! 4-byte Folded Spill -; SPARC64-NEXT: call cosf -; SPARC64-NEXT: ld [%fp+2031], %f1 -; SPARC64-NEXT: st %f0, [%fp+2031] ! 4-byte Folded Spill +; SPARC64-NEXT: nop +; SPARC64-NEXT: fmovs %f0, %f1 +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: nop +; SPARC64-NEXT: mov %o0, %i1 ; SPARC64-NEXT: call cosf ; SPARC64-NEXT: ld [%fp+2043], %f1 -; SPARC64-NEXT: fmovs %f0, %f3 -; SPARC64-NEXT: ld [%fp+2039], %f0 ! 4-byte Folded Reload -; SPARC64-NEXT: ld [%fp+2035], %f1 ! 4-byte Folded Reload -; SPARC64-NEXT: ld [%fp+2031], %f2 ! 4-byte Folded Reload +; SPARC64-NEXT: fmovs %f0, %f1 +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: nop +; SPARC64-NEXT: mov %o0, %i2 +; SPARC64-NEXT: call cosf +; SPARC64-NEXT: ld [%fp+2039], %f1 +; SPARC64-NEXT: fmovs %f0, %f1 +; SPARC64-NEXT: call __truncsfhf2 +; SPARC64-NEXT: nop ; SPARC64-NEXT: ret -; SPARC64-NEXT: restore +; SPARC64-NEXT: restore %g0, %o0, %o3 ; ; GNU32-LABEL: test_sincos_v2f16: ; GNU32: ! 
%bb.0: -; GNU32-NEXT: save %sp, -120, %sp -; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: save %sp, -144, %sp +; GNU32-NEXT: call __extendhfsf2 ; GNU32-NEXT: mov %i1, %o0 -; GNU32-NEXT: call __extendhfsf2 -; GNU32-NEXT: nop -; GNU32-NEXT: st %f0, [%fp+-28] ! 4-byte Folded Spill -; GNU32-NEXT: call __truncsfhf2 -; GNU32-NEXT: mov %i0, %o0 -; GNU32-NEXT: call __extendhfsf2 -; GNU32-NEXT: nop -; GNU32-NEXT: st %f0, [%fp+-20] -; GNU32-NEXT: ld [%fp+-20], %o0 +; GNU32-NEXT: st %f0, [%fp+-32] +; GNU32-NEXT: ld [%fp+-32], %o0 ; GNU32-NEXT: add %fp, -12, %o1 ; GNU32-NEXT: call sincosf ; GNU32-NEXT: add %fp, -16, %o2 -; GNU32-NEXT: ld [%fp+-28], %f0 ! 4-byte Folded Reload -; GNU32-NEXT: st %f0, [%fp+-24] -; GNU32-NEXT: ld [%fp+-24], %o0 -; GNU32-NEXT: add %fp, -4, %o1 +; GNU32-NEXT: call __extendhfsf2 +; GNU32-NEXT: mov %i0, %o0 +; GNU32-NEXT: st %f0, [%fp+-28] +; GNU32-NEXT: ld [%fp+-28], %o0 +; GNU32-NEXT: add %fp, -20, %o1 ; GNU32-NEXT: call sincosf -; GNU32-NEXT: add %fp, -8, %o2 +; GNU32-NEXT: add %fp, -24, %o2 +; GNU32-NEXT: ld [%fp+-16], %f0 +; GNU32-NEXT: st %f0, [%fp+-44] +; GNU32-NEXT: ld [%fp+-24], %f0 +; GNU32-NEXT: st %f0, [%fp+-36] ; GNU32-NEXT: ld [%fp+-12], %f0 -; GNU32-NEXT: ld [%fp+-4], %f1 -; GNU32-NEXT: ld [%fp+-16], %f2 -; GNU32-NEXT: ld [%fp+-8], %f3 +; GNU32-NEXT: st %f0, [%fp+-48] +; GNU32-NEXT: ld [%fp+-20], %f0 +; GNU32-NEXT: st %f0, [%fp+-40] +; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: ld [%fp+-44], %o0 +; GNU32-NEXT: sethi 63, %i0 +; GNU32-NEXT: or %i0, 1023, %i0 +; GNU32-NEXT: and %o0, %i0, %i4 +; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: ld [%fp+-36], %o0 +; GNU32-NEXT: and %o0, %i0, %i2 +; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: ld [%fp+-48], %o0 +; GNU32-NEXT: and %o0, %i0, %i1 +; GNU32-NEXT: call __truncsfhf2 +; GNU32-NEXT: ld [%fp+-40], %o0 +; GNU32-NEXT: and %o0, %i0, %g2 +; GNU32-NEXT: mov %g2, %i0 +; GNU32-NEXT: ! 
kill: def $i2 killed $i2 killed $i2_i3 ; GNU32-NEXT: ret -; GNU32-NEXT: restore +; GNU32-NEXT: restore %g0, %i4, %o3 ; ; GNU64-LABEL: test_sincos_v2f16: ; GNU64: ! %bb.0: -; GNU64-NEXT: save %sp, -208, %sp -; GNU64-NEXT: st %f1, [%fp+2023] ! 4-byte Folded Spill -; GNU64-NEXT: fmovs %f3, %f1 -; GNU64-NEXT: call __truncsfhf2 -; GNU64-NEXT: nop +; GNU64-NEXT: save %sp, -192, %sp ; GNU64-NEXT: call __extendhfsf2 -; GNU64-NEXT: nop -; GNU64-NEXT: st %f0, [%fp+2027] ! 4-byte Folded Spill -; GNU64-NEXT: call __truncsfhf2 -; GNU64-NEXT: ld [%fp+2023], %f1 -; GNU64-NEXT: call __extendhfsf2 -; GNU64-NEXT: nop +; GNU64-NEXT: srl %i0, 0, %o0 ; GNU64-NEXT: add %fp, 2035, %o1 ; GNU64-NEXT: add %fp, 2031, %o2 ; GNU64-NEXT: fmovs %f0, %f1 ; GNU64-NEXT: call sincosf ; GNU64-NEXT: nop +; GNU64-NEXT: call __extendhfsf2 +; GNU64-NEXT: srl %i1, 0, %o0 ; GNU64-NEXT: add %fp, 2043, %o1 ; GNU64-NEXT: add %fp, 2039, %o2 +; GNU64-NEXT: fmovs %f0, %f1 ; GNU64-NEXT: call sincosf -; GNU64-NEXT: ld [%fp+2027], %f1 -; GNU64-NEXT: ld [%fp+2035], %f0 +; GNU64-NEXT: nop +; GNU64-NEXT: call __truncsfhf2 +; GNU64-NEXT: ld [%fp+2035], %f1 +; GNU64-NEXT: mov %o0, %i0 +; GNU64-NEXT: call __truncsfhf2 ; GNU64-NEXT: ld [%fp+2043], %f1 -; GNU64-NEXT: ld [%fp+2031], %f2 -; GNU64-NEXT: ld [%fp+2039], %f3 +; GNU64-NEXT: mov %o0, %i1 +; GNU64-NEXT: call __truncsfhf2 +; GNU64-NEXT: ld [%fp+2031], %f1 +; GNU64-NEXT: mov %o0, %i2 +; GNU64-NEXT: call __truncsfhf2 +; GNU64-NEXT: ld [%fp+2039], %f1 ; GNU64-NEXT: ret -; GNU64-NEXT: restore +; GNU64-NEXT: restore %g0, %o0, %o3 %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } From 4b94c08a57b2b026aa434ef69823d579d56cfbda Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 18 Aug 2025 14:01:41 -0500 Subject: [PATCH 21/27] [lldb] Relax the error message in TestProcessCrashInfo.py (#153653) The error message has been updated in macOS 26. 
Relax the error message to check the more generic "BUG IN CLIENT OF LIBMALLOC" rather than the error message that comes after. --- .../process_crash_info/TestProcessCrashInfo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py b/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py index af05c2f3a0f6..4924937b4fe2 100644 --- a/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py +++ b/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py @@ -38,7 +38,7 @@ class PlatformProcessCrashInfoTestCase(TestBase): patterns=[ "Extended Crash Information", "Crash-Info Annotations", - "pointer being freed was not allocated", + "BUG IN CLIENT OF LIBMALLOC", ], ) @@ -67,7 +67,7 @@ class PlatformProcessCrashInfoTestCase(TestBase): self.assertTrue(crash_info.IsValid()) - self.assertIn("pointer being freed was not allocated", stream.GetData()) + self.assertIn("BUG IN CLIENT OF LIBMALLOC", stream.GetData()) # dyld leaves permanent crash_info records when testing on device. @skipIfDarwinEmbedded From d30fd562e8a45c90e8b256890100442b61e0dac8 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Mon, 18 Aug 2025 21:07:41 +0200 Subject: [PATCH 22/27] [LifetimeSafety] Enhance benchmark script for new sub analyses (#149577) Enhanced the lifetime safety analysis benchmark script with more detailed performance metrics and a new nested loop test case. This is a worst case for loan expiry analysis. ### What changed? 
- Added a new test case `nested_loops` that generates code with N levels of nested loops to test how analysis performance scales with loop nesting depth - Improved the trace file analysis to extract durations for sub-phases of the lifetime analysis (FactGenerator, LoanPropagation, ExpiredLoans) - Enhanced the markdown report generation to include: - Relative timing results as percentages of total Clang time - More detailed complexity analysis for each analysis phase Report # Lifetime Analysis Performance Report > Generated on: 2025-08-18 13:29:57 --- ## Test Case: Pointer Cycle in Loop **Timing Results:** | N (Input Size) | Total Time | Analysis Time (%) | Fact Generator (%) | Loan Propagation (%) | Expired Loans (%) | |:---------------|-----------:|------------------:|-------------------:|---------------------:|------------------:| | 10 | 10.75 ms | 24.61% | 0.00% | 24.38% | 0.00% | | 25 | 64.98 ms | 86.08% | 0.00% | 86.02% | 0.00% | | 50 | 709.37 ms | 98.53% | 0.00% | 98.51% | 0.00% | | 75 | 3.13 s | 99.63% | 0.00% | 99.63% | 0.00% | | 100 | 9.44 s | 99.85% | 0.00% | 99.84% | 0.00% | | 150 | 45.31 s | 99.96% | 0.00% | 99.96% | 0.00% | **Complexity Analysis:** | Analysis Phase | Complexity O(n^k) | |:------------------|:--------------------------| | Total Analysis | O(n^3.87 ± 0.01) | | FactGenerator | (Negligible) | | LoanPropagation | O(n^3.87 ± 0.01) | | ExpiredLoans | (Negligible) | --- ## Test Case: CFG Merges **Timing Results:** | N (Input Size) | Total Time | Analysis Time (%) | Fact Generator (%) | Loan Propagation (%) | Expired Loans (%) | |:---------------|-----------:|------------------:|-------------------:|---------------------:|------------------:| | 10 | 8.54 ms | 0.00% | 0.00% | 0.00% | 0.00% | | 50 | 40.85 ms | 65.09% | 0.00% | 64.61% | 0.00% | | 100 | 207.70 ms | 93.58% | 0.00% | 93.46% | 0.00% | | 200 | 1.54 s | 98.82% | 0.00% | 98.78% | 0.00% | | 400 | 12.04 s | 99.72% | 0.00% | 99.71% | 0.01% | | 800 | 96.73 s | 99.94% | 0.00% | 99.94% | 0.00% | 
**Complexity Analysis:** | Analysis Phase | Complexity O(n^k) | |:------------------|:--------------------------| | Total Analysis | O(n^3.01 ± 0.00) | | FactGenerator | (Negligible) | | LoanPropagation | O(n^3.01 ± 0.00) | | ExpiredLoans | (Negligible) | --- ## Test Case: Deeply Nested Loops **Timing Results:** | N (Input Size) | Total Time | Analysis Time (%) | Fact Generator (%) | Loan Propagation (%) | Expired Loans (%) | |:---------------|-----------:|------------------:|-------------------:|---------------------:|------------------:| | 10 | 8.25 ms | 0.00% | 0.00% | 0.00% | 0.00% | | 50 | 27.25 ms | 51.87% | 0.00% | 45.71% | 5.93% | | 100 | 113.42 ms | 82.48% | 0.00% | 72.74% | 9.62% | | 200 | 730.05 ms | 95.24% | 0.00% | 83.95% | 11.25% | | 400 | 5.40 s | 98.74% | 0.01% | 87.05% | 11.68% | | 800 | 41.86 s | 99.62% | 0.00% | 87.77% | 11.84% | **Complexity Analysis:** | Analysis Phase | Complexity O(n^k) | |:------------------|:--------------------------| | Total Analysis | O(n^2.97 ± 0.00) | | FactGenerator | (Negligible) | | LoanPropagation | O(n^2.96 ± 0.00) | | ExpiredLoans | O(n^2.97 ± 0.00) | --- --- .../test/Analysis/LifetimeSafety/benchmark.py | 221 +++++++++++++----- 1 file changed, 158 insertions(+), 63 deletions(-) diff --git a/clang/test/Analysis/LifetimeSafety/benchmark.py b/clang/test/Analysis/LifetimeSafety/benchmark.py index 9d5f36c51b9e..4421fe9a81e2 100644 --- a/clang/test/Analysis/LifetimeSafety/benchmark.py +++ b/clang/test/Analysis/LifetimeSafety/benchmark.py @@ -99,28 +99,84 @@ def generate_cpp_merge_test(n: int) -> str: return cpp_code -def analyze_trace_file(trace_path: str) -> tuple[float, float]: +def generate_cpp_nested_loop_test(n: int) -> str: """ - Parses the -ftime-trace JSON output to find durations. + Generates C++ code with N levels of nested loops. + This pattern tests how analysis performance scales with loop nesting depth, + which is a key factor in the complexity of dataflow analyses on structured + control flow. 
- Returns: - A tuple of (lifetime_analysis_duration_us, total_clang_duration_us). + Example (n=3): + struct MyObj { int id; ~MyObj() {} }; + void nested_loops_3() { + MyObj* p = nullptr; + for(int i0=0; i0<2; ++i0) { + MyObj s0; + p = &s0; + for(int i1=0; i1<2; ++i1) { + MyObj s1; + p = &s1; + for(int i2=0; i2<2; ++i2) { + MyObj s2; + p = &s2; + } + } + } + } """ - lifetime_duration = 0.0 - total_duration = 0.0 + if n <= 0: + return "// Nesting depth must be positive." + + cpp_code = "struct MyObj { int id; ~MyObj() {} };\n\n" + cpp_code += f"void nested_loops_{n}() {{\n" + cpp_code += " MyObj* p = nullptr;\n" + + for i in range(n): + indent = " " * (i + 1) + cpp_code += f"{indent}for(int i{i}=0; i{i}<2; ++i{i}) {{\n" + cpp_code += f"{indent} MyObj s{i}; p = &s{i};\n" + + for i in range(n - 1, -1, -1): + indent = " " * (i + 1) + cpp_code += f"{indent}}}\n" + + cpp_code += "}\n" + cpp_code += f"\nint main() {{ nested_loops_{n}(); return 0; }}\n" + return cpp_code + + +def analyze_trace_file(trace_path: str) -> dict: + """ + Parses the -ftime-trace JSON output to find durations for the lifetime + analysis and its sub-phases. + Returns a dictionary of durations in microseconds. 
+ """ + durations = { + "lifetime_us": 0.0, + "total_us": 0.0, + "fact_gen_us": 0.0, + "loan_prop_us": 0.0, + "expired_loans_us": 0.0, + } + event_name_map = { + "LifetimeSafetyAnalysis": "lifetime_us", + "ExecuteCompiler": "total_us", + "FactGenerator": "fact_gen_us", + "LoanPropagation": "loan_prop_us", + "ExpiredLoans": "expired_loans_us", + } try: with open(trace_path, "r") as f: trace_data = json.load(f) for event in trace_data.get("traceEvents", []): - if event.get("name") == "LifetimeSafetyAnalysis": - lifetime_duration += float(event.get("dur", 0)) - if event.get("name") == "ExecuteCompiler": - total_duration += float(event.get("dur", 0)) - + event_name = event.get("name") + if event_name in event_name_map: + key = event_name_map[event_name] + durations[key] += float(event.get("dur", 0)) except (IOError, json.JSONDecodeError) as e: print(f"Error reading or parsing trace file {trace_path}: {e}", file=sys.stderr) - return 0.0, 0.0 - return lifetime_duration, total_duration + return {key: 0.0 for key in durations} + return durations def power_law(n, c, k): @@ -135,8 +191,29 @@ def human_readable_time(ms: float) -> str: return f"{ms:.2f} ms" +def calculate_complexity(n_data, y_data) -> tuple[float | None, float | None]: + """ + Calculates the exponent 'k' for the power law fit y = c * n^k. + Returns a tuple of (k, k_standard_error). 
+ """ + try: + if len(n_data) < 3 or np.all(y_data < 1e-6) or np.var(y_data) < 1e-6: + return None, None + + non_zero_indices = y_data > 0 + if np.sum(non_zero_indices) < 3: + return None, None + + n_fit, y_fit = n_data[non_zero_indices], y_data[non_zero_indices] + popt, pcov = curve_fit(power_law, n_fit, y_fit, p0=[0, 1], maxfev=5000) + k_stderr = np.sqrt(np.diag(pcov))[1] + return popt[1], k_stderr + except (RuntimeError, ValueError): + return None, None + + def generate_markdown_report(results: dict) -> str: - """Generates a Markdown-formatted report from the benchmark results.""" + """Generates a concise, Markdown-formatted report from the benchmark results.""" report = [] timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S %Z") report.append(f"# Lifetime Analysis Performance Report") @@ -146,54 +223,52 @@ def generate_markdown_report(results: dict) -> str: for test_name, data in results.items(): title = data["title"] report.append(f"## Test Case: {title}") - report.append("") + report.append("\n**Timing Results:**\n") # Table header - report.append("| N | Analysis Time | Total Clang Time |") - report.append("|:----|--------------:|-----------------:|") + report.append( + "| N (Input Size) | Total Time | Analysis Time (%) | Fact Generator (%) | Loan Propagation (%) | Expired Loans (%) |" + ) + report.append( + "|:---------------|-----------:|------------------:|-------------------:|---------------------:|------------------:|" + ) # Table rows n_data = np.array(data["n"]) - analysis_data = np.array(data["lifetime_ms"]) - total_data = np.array(data["total_ms"]) + total_ms_data = np.array(data["total_ms"]) for i in range(len(n_data)): - analysis_str = human_readable_time(analysis_data[i]) - total_str = human_readable_time(total_data[i]) - report.append(f"| {n_data[i]:<3} | {analysis_str:>13} | {total_str:>16} |") + total_t = total_ms_data[i] + if total_t < 1e-6: + total_t = 1.0 # Avoid division by zero - report.append("") + row = [ + f"| {n_data[i]:<14} |", + 
f"{human_readable_time(total_t):>10} |", + f"{data['lifetime_ms'][i] / total_t * 100:>17.2f}% |", + f"{data['fact_gen_ms'][i] / total_t * 100:>18.2f}% |", + f"{data['loan_prop_ms'][i] / total_t * 100:>20.2f}% |", + f"{data['expired_loans_ms'][i] / total_t * 100:>17.2f}% |", + ] + report.append(" ".join(row)) - # Complexity analysis - report.append(f"**Complexity Analysis:**") - try: - # Curve fitting requires at least 3 points - if len(n_data) < 3: - raise ValueError("Not enough data points to perform curve fitting.") + report.append("\n**Complexity Analysis:**\n") + report.append("| Analysis Phase | Complexity O(nk) |") + report.append("|:------------------|:--------------------------|") - popt, pcov = curve_fit( - power_law, n_data, analysis_data, p0=[0, 2], maxfev=5000 - ) - _, k = popt + analysis_phases = { + "Total Analysis": data["lifetime_ms"], + "FactGenerator": data["fact_gen_ms"], + "LoanPropagation": data["loan_prop_ms"], + "ExpiredLoans": data["expired_loans_ms"], + } - # Confidence Interval for k - alpha = 0.05 # 95% confidence - dof = max(0, len(n_data) - len(popt)) # degrees of freedom - t_val = t.ppf(1.0 - alpha / 2.0, dof) - # Standard error of the parameters - perr = np.sqrt(np.diag(pcov)) - k_stderr = perr[1] - k_ci_lower = k - t_val * k_stderr - k_ci_upper = k + t_val * k_stderr - - report.append( - f"- The performance for this case scales approx. as **O(n{k:.2f})**." - ) - report.append( - f"- **95% Confidence interval for exponent:** `[{k_ci_lower:.2f}, {k_ci_upper:.2f}]`." 
- ) - - except (RuntimeError, ValueError) as e: - report.append(f"- Could not determine a best-fit curve for the data: {e}") + for phase_name, y_data in analysis_phases.items(): + k, delta = calculate_complexity(n_data, np.array(y_data)) + if k is not None and delta is not None: + complexity_str = f"O(n{k:.2f} ± {delta:.2f})" + else: + complexity_str = "(Negligible)" + report.append(f"| {phase_name:<17} | {complexity_str:<25} |") report.append("\n---\n") @@ -202,7 +277,7 @@ def generate_markdown_report(results: dict) -> str: def run_single_test( clang_binary: str, output_dir: str, test_name: str, generator_func, n: int -) -> tuple[float, float]: +) -> dict: """Generates, compiles, and benchmarks a single test case.""" print(f"--- Running Test: {test_name.capitalize()} with N={n} ---") @@ -221,7 +296,8 @@ def run_single_test( "-o", "/dev/null", "-ftime-trace=" + trace_file, - "-Wexperimental-lifetime-safety", + "-Xclang", + "-fexperimental-lifetime-safety", "-std=c++17", source_file, ] @@ -231,11 +307,12 @@ def run_single_test( if result.returncode != 0: print(f"Compilation failed for N={n}!", file=sys.stderr) print(result.stderr, file=sys.stderr) - return 0.0, 0.0 + return {} - lifetime_us, total_us = analyze_trace_file(trace_file) - - return lifetime_us / 1000.0, total_us / 1000.0 + durations_us = analyze_trace_file(trace_file) + return { + key.replace("_us", "_ms"): value / 1000.0 for key, value in durations_us.items() + } if __name__ == "__main__": @@ -270,6 +347,12 @@ if __name__ == "__main__": "generator_func": generate_cpp_merge_test, "n_values": [10, 50, 100, 200, 400, 800], }, + { + "name": "nested_loops", + "title": "Deeply Nested Loops", + "generator_func": generate_cpp_nested_loop_test, + "n_values": [10, 50, 100, 200, 400, 800], + }, ] results = {} @@ -282,21 +365,28 @@ if __name__ == "__main__": "n": [], "lifetime_ms": [], "total_ms": [], + "fact_gen_ms": [], + "loan_prop_ms": [], + "expired_loans_ms": [], } for n in config["n_values"]: - lifetime_ms, 
total_ms = run_single_test( + durations_ms = run_single_test( args.clang_binary, args.output_dir, test_name, config["generator_func"], n, ) - if total_ms > 0: + if durations_ms: results[test_name]["n"].append(n) - results[test_name]["lifetime_ms"].append(lifetime_ms) - results[test_name]["total_ms"].append(total_ms) + for key, value in durations_ms.items(): + results[test_name][key].append(value) + print( - f" Total: {human_readable_time(total_ms)} | Analysis: {human_readable_time(lifetime_ms)}" + f" Total Analysis: {human_readable_time(durations_ms['lifetime_ms'])} | " + f"FactGen: {human_readable_time(durations_ms['fact_gen_ms'])} | " + f"LoanProp: {human_readable_time(durations_ms['loan_prop_ms'])} | " + f"ExpiredLoans: {human_readable_time(durations_ms['expired_loans_ms'])}" ) print("\n\n" + "=" * 80) @@ -305,3 +395,8 @@ if __name__ == "__main__": markdown_report = generate_markdown_report(results) print(markdown_report) + + report_filename = os.path.join(args.output_dir, "performance_report.md") + with open(report_filename, "w") as f: + f.write(markdown_report) + print(f"Report saved to: {report_filename}") From 1bb72170501b95afd8124c4026bf927385be9b47 Mon Sep 17 00:00:00 2001 From: Usama Hameed Date: Mon, 18 Aug 2025 12:08:45 -0700 Subject: [PATCH 23/27] [Sanitizers][Darwin][Test] The top few frames are inaccurate in UBSan. 
(#153899) XFailing until further investigation rdar://158303080 --- .../TestCases/Posix/dedup_token_length_test.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/dedup_token_length_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/dedup_token_length_test.cpp index deedbba76cde..37bfee480617 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/dedup_token_length_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/dedup_token_length_test.cpp @@ -10,6 +10,10 @@ // REQUIRES: stable-runtime +// rdar://158303080 top few frames are at times inaccurate in ubsan fast stack +// unwind on darwin +// XFAIL: (darwin && ubsan && (arm64-target-arch || arm64e-target-arch)) + // XFAIL: target={{.*netbsd.*}} && !asan volatile int *null = 0; From e7c2c80fa16644b8c4e47c75caffaea8bc20a30d Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 18 Aug 2025 12:13:16 -0700 Subject: [PATCH 24/27] [AMDGPU] Combine prng(undef) -> undef (#154160) --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 ++- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 64e68ab7d753..a28e272367c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4002,7 +4002,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_rsq_clamp: - case Intrinsic::amdgcn_tanh: { + case Intrinsic::amdgcn_tanh: + case Intrinsic::amdgcn_prng_b32: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(1); return Src.isUndef() ? 
Src : SDValue(); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll index 6a5dc8f8dd0a..2daf9c3b472f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll @@ -1,6 +1,6 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.prng.b32(i32) #0 @@ -29,6 +29,13 @@ define amdgpu_kernel void @prng_b32_constant_100(ptr addrspace(1) %out) #1 { ret void } +; GCN-LABEL: {{^}}prng_undef_i32: +; SDAG-NOT: v_prng_b32 +define amdgpu_kernel void @prng_undef_i32(ptr addrspace(1) %out) #1 { + %prng = call i32 @llvm.amdgcn.prng.b32(i32 undef) + store i32 %prng, ptr addrspace(1) %out, align 4 + ret void +} attributes #0 = { nounwind readnone } attributes #1 = { nounwind } From 3d6177c14b4dca7412d929ef364196a98403ef01 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 18 Aug 2025 12:13:31 -0700 Subject: [PATCH 25/27] [AMDGPU] Avoid setting op_sel_hi bits if there is matrix_b_scale. NFCI. (#154176) This is NFCI now as there is no matrix_b_scale without matrix_b_reuse, but technically this condition shall be here. 
--- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f3580842c6ff..61f673221739 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -389,6 +389,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) && // Matrix B format operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) && + // Matrix B scale operand reuses op_sel_hi. + !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_scale) && // Matrix B reuse operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) { Encoding |= getImplicitOpSelHiEncoding(Opcode); From 986d7aa675e957e0160aeb2f045a6abf1bf2082e Mon Sep 17 00:00:00 2001 From: Daniel Thornburgh Date: Mon, 18 Aug 2025 12:19:19 -0700 Subject: [PATCH 26/27] Bump ProtocolServerMCPTest timeout to 200ms (#154182) This should reduce flakes observed in the Fuchsia AArch64 Linux LLDB CI builders. --- lldb/unittests/ProtocolServer/ProtocolMCPServerTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/unittests/ProtocolServer/ProtocolMCPServerTest.cpp b/lldb/unittests/ProtocolServer/ProtocolMCPServerTest.cpp index 2ac40c41dd28..de2ae2313ecd 100644 --- a/lldb/unittests/ProtocolServer/ProtocolMCPServerTest.cpp +++ b/lldb/unittests/ProtocolServer/ProtocolMCPServerTest.cpp @@ -144,7 +144,7 @@ public: template void RunOnce(const std::function)> &callback, - std::chrono::milliseconds timeout = std::chrono::milliseconds(100)) { + std::chrono::milliseconds timeout = std::chrono::milliseconds(200)) { auto handle = m_transport_up->RegisterReadObject

( loop, [&](lldb_private::MainLoopBase &loop, llvm::Expected

message) { callback(std::move(message)); From 9403c2d64d63c16a09739d943eaa22b8e3499b7a Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig Date: Tue, 19 Aug 2025 00:51:08 +0530 Subject: [PATCH 27/27] Reland [clang][modules-driver] Add scanner to detect C++20 module presence (#153497) This patch is part of a series to support driver managed module builds for C++ named modules and Clang modules. This introduces a scanner that detects C++ named module usage early in the driver with only negligible overhead. For now, it is enabled only with the `-fmodules-driver` flag and serves solely diagnostic purposes. In the future, the scanner will be enabled for any (modules-driver compatible) compilation with two or more inputs, and will help the driver determine whether to implicitly enable the modules driver. Since the scanner adds very little overhead, we are also exploring enabling it for compilations with only a single input. This approach could allow us to detect `import std` usage in a single-file compilation, which would then activate the modules driver. For performance measurements on this, see https://github.com/naveen-seth/llvm-dev-cxx-modules-check-benchmark. RFC for driver managed module builds: https://discourse.llvm.org/t/rfc-modules-support-simple-c-20-modules-use-from-the-clang-driver-without-a-build-system This patch relands the reland (2d31fc8) for commit ded1426. The earlier reland failed due to a missing link dependency on `clangLex`. 
This reland fixes the issue by adding the link dependency after discussing it in the following RFC: https://discourse.llvm.org/t/rfc-driver-link-the-driver-against-clangdependencyscanning-clangast-clangfrontend-clangserialization-and-clanglex --- .../clang/Basic/DiagnosticDriverKinds.td | 7 + clang/include/clang/Basic/DiagnosticGroups.td | 1 + clang/include/clang/Driver/Driver.h | 32 +++ clang/include/clang/Driver/Options.td | 7 + .../clang/Lex/DependencyDirectivesScanner.h | 7 + clang/lib/Driver/CMakeLists.txt | 1 + clang/lib/Driver/Driver.cpp | 67 ++++++ clang/lib/Lex/DependencyDirectivesScanner.cpp | 50 +++++ ...ules-driver-cxx20-module-usage-scanner.cpp | 192 ++++++++++++++++++ 9 files changed, 364 insertions(+) create mode 100644 clang/test/Driver/modules-driver-cxx20-module-usage-scanner.cpp diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 0f17f4aa761e..6df8f9932f30 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -581,6 +581,13 @@ def err_drv_reduced_module_output_overrided : Warning< "please consider use '-fmodule-output=' to specify the output file for reduced BMI explicitly">, InGroup>; +def remark_found_cxx20_module_usage : Remark< + "found C++20 module usage in file '%0'">, + InGroup; +def remark_performing_driver_managed_module_build : Remark< + "performing driver managed module build">, + InGroup; + def warn_drv_delayed_template_parsing_after_cxx20 : Warning< "-fdelayed-template-parsing is deprecated after C++20">, InGroup>; diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 2edf4da43536..e29c4694fa5e 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -635,6 +635,7 @@ def ModuleConflict : DiagGroup<"module-conflict">; def ModuleFileExtension : DiagGroup<"module-file-extension">; def 
ModuleIncludeDirectiveTranslation : DiagGroup<"module-include-translation">; def ModuleMap : DiagGroup<"module-map">; +def ModulesDriver : DiagGroup<"modules-driver">; def RoundTripCC1Args : DiagGroup<"round-trip-cc1-args">; def NewlineEOF : DiagGroup<"newline-eof">; def Nullability : DiagGroup<"nullability">; diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 4d32552b7f85..b9b187ada8ad 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -512,6 +512,9 @@ public: /// BuildActions - Construct the list of actions to perform for the /// given arguments, which are only done for a single architecture. + /// If the compilation is an explicit module build, delegates to + /// BuildDriverManagedModuleBuildActions. Otherwise, BuildDefaultActions is + /// used. /// /// \param C - The compilation that is being built. /// \param Args - The input arguments. @@ -796,6 +799,35 @@ private: /// compilation based on which -f(no-)?lto(=.*)? option occurs last. void setLTOMode(const llvm::opt::ArgList &Args); + /// BuildDefaultActions - Constructs the list of actions to perform + /// for the provided arguments, which are only done for a single architecture. + /// + /// \param C - The compilation that is being built. + /// \param Args - The input arguments. + /// \param Actions - The list to store the resulting actions onto. + void BuildDefaultActions(Compilation &C, llvm::opt::DerivedArgList &Args, + const InputList &Inputs, ActionList &Actions) const; + + /// BuildDriverManagedModuleBuildActions - Performs a dependency + /// scan and constructs the list of actions to perform for dependency order + /// and the provided arguments. This is only done for a single architecture. + /// + /// \param C - The compilation that is being built. + /// \param Args - The input arguments. + /// \param Actions - The list to store the resulting actions onto. 
+ void BuildDriverManagedModuleBuildActions(Compilation &C, + llvm::opt::DerivedArgList &Args, + const InputList &Inputs, + ActionList &Actions) const; + + /// Scans the leading lines of the C++ source inputs to detect C++20 module + /// usage. + /// + /// \returns True if module usage is detected, false otherwise, or an error on + /// read failure. + llvm::ErrorOr + ScanInputsForCXX20ModulesUsage(const InputList &Inputs) const; + /// Retrieves a ToolChain for a particular \p Target triple. /// /// Will cache ToolChains for the life of the driver object, and create them diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 6a2f4575459b..06bff0bf3b4f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3296,6 +3296,13 @@ defm modules_reduced_bmi : BoolOption<"f", "modules-reduced-bmi", PosFlag>; +def fmodules_driver : Flag<["-"], "fmodules-driver">, + Group, Visibility<[ClangOption]>, + HelpText<"Enable support for driver managed module builds (experimental)">; +def fno_modules_driver : Flag<["-"], "fno-modules-driver">, + Group, Visibility<[ClangOption]>, + HelpText<"Disable support for driver managed module builds (experimental)">; + def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">, Group, Visibility<[ClangOption, CC1Option]>, Alias; diff --git a/clang/include/clang/Lex/DependencyDirectivesScanner.h b/clang/include/clang/Lex/DependencyDirectivesScanner.h index f9fec3998ca5..c0b742d652a0 100644 --- a/clang/include/clang/Lex/DependencyDirectivesScanner.h +++ b/clang/include/clang/Lex/DependencyDirectivesScanner.h @@ -135,6 +135,13 @@ void printDependencyDirectivesAsSource( ArrayRef Directives, llvm::raw_ostream &OS); +/// Scan an input source buffer for C++20 named module usage. +/// +/// \param Source The input source buffer. +/// +/// \returns true if any C++20 named modules related directive was found. 
+bool scanInputForCXX20ModulesUsage(StringRef Source); + /// Functor that returns the dependency directives for a given file. class DependencyDirectivesGetter { public: diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 45782cbd9d16..7c4f70b966c4 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt @@ -98,5 +98,6 @@ add_clang_library(clangDriver LINK_LIBS clangBasic + clangLex ${system_libs} ) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 8c0bba938a09..d682ffc832c8 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -66,6 +66,7 @@ #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Types.h" +#include "clang/Lex/DependencyDirectivesScanner.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -4188,6 +4189,11 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args, YcArg = nullptr; } + if (Args.hasArgNoClaim(options::OPT_fmodules_driver)) + // TODO: Check against all incompatible -fmodules-driver arguments + if (!ModulesModeCXX20 && !Args.hasArgNoClaim(options::OPT_fmodules)) + Args.eraseArg(options::OPT_fmodules_driver); + Arg *FinalPhaseArg; phases::ID FinalPhase = getFinalPhase(Args, &FinalPhaseArg); @@ -4314,6 +4320,33 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args, } } +static bool hasCXXModuleInputType(const Driver::InputList &Inputs) { + const auto IsTypeCXXModule = [](const auto &Input) -> bool { + const auto TypeID = Input.first; + return (TypeID == types::TY_CXXModule); + }; + return llvm::any_of(Inputs, IsTypeCXXModule); +} + +llvm::ErrorOr +Driver::ScanInputsForCXX20ModulesUsage(const InputList &Inputs) const { + const auto CXXInputs = llvm::make_filter_range( + Inputs, [](const auto &Input) { return types::isCXX(Input.first); }); + for (const auto &Input : CXXInputs) { + StringRef Filename = 
Input.second->getSpelling(); + auto ErrOrBuffer = VFS->getBufferForFile(Filename); + if (!ErrOrBuffer) + return ErrOrBuffer.getError(); + const auto Buffer = std::move(*ErrOrBuffer); + + if (scanInputForCXX20ModulesUsage(Buffer->getBuffer())) { + Diags.Report(diag::remark_found_cxx20_module_usage) << Filename; + return true; + } + } + return false; +} + void Driver::BuildActions(Compilation &C, DerivedArgList &Args, const InputList &Inputs, ActionList &Actions) const { llvm::PrettyStackTraceString CrashInfo("Building compilation actions"); @@ -4325,6 +4358,33 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, handleArguments(C, Args, Inputs, Actions); + if (Args.hasFlag(options::OPT_fmodules_driver, + options::OPT_fno_modules_driver, false)) { + // TODO: Move the logic for implicitly enabling explicit-module-builds out + // of -fmodules-driver once it is no longer experimental. + // Currently, this serves diagnostic purposes only. + bool UsesCXXModules = hasCXXModuleInputType(Inputs); + if (!UsesCXXModules) { + const auto ErrOrScanResult = ScanInputsForCXX20ModulesUsage(Inputs); + if (!ErrOrScanResult) { + Diags.Report(diag::err_cannot_open_file) + << ErrOrScanResult.getError().message(); + return; + } + UsesCXXModules = *ErrOrScanResult; + } + if (UsesCXXModules || Args.hasArg(options::OPT_fmodules)) + BuildDriverManagedModuleBuildActions(C, Args, Inputs, Actions); + return; + } + + BuildDefaultActions(C, Args, Inputs, Actions); +} + +void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args, + const InputList &Inputs, + ActionList &Actions) const { + bool UseNewOffloadingDriver = C.isOffloadingHostKind(Action::OFK_OpenMP) || C.isOffloadingHostKind(Action::OFK_SYCL) || @@ -4608,6 +4668,13 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, Args.ClaimAllArgs(options::OPT_cl_ignored_Group); } +void Driver::BuildDriverManagedModuleBuildActions( + Compilation &C, llvm::opt::DerivedArgList &Args, const InputList &Inputs, + 
ActionList &Actions) const { + Diags.Report(diag::remark_performing_driver_managed_module_build); + return; +} + /// Returns the canonical name for the offloading architecture when using a HIP /// or CUDA architecture. static StringRef getCanonicalArchString(Compilation &C, diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index 9ccff5e3342d..eee57c786442 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -83,6 +83,8 @@ struct Scanner { /// \returns True on error. bool scan(SmallVectorImpl &Directives); + friend bool clang::scanInputForCXX20ModulesUsage(StringRef Source); + private: /// Lexes next token and advances \p First and the \p Lexer. [[nodiscard]] dependency_directives_scan::Token & @@ -1075,3 +1077,51 @@ void clang::printDependencyDirectivesAsSource( } } } + +static void skipUntilMaybeCXX20ModuleDirective(const char *&First, + const char *const End) { + assert(First <= End); + while (First != End) { + if (*First == '#') { + ++First; + skipToNewlineRaw(First, End); + } + skipWhitespace(First, End); + if (const auto Len = isEOL(First, End)) { + First += Len; + continue; + } + break; + } +} + +bool clang::scanInputForCXX20ModulesUsage(StringRef Source) { + const char *First = Source.begin(); + const char *const End = Source.end(); + skipUntilMaybeCXX20ModuleDirective(First, End); + if (First == End) + return false; + + // Check if the next token can even be a module directive before creating a + // full lexer. 
+ if (!(*First == 'i' || *First == 'e' || *First == 'm')) + return false; + + llvm::SmallVector Tokens; + Scanner S(StringRef(First, End - First), Tokens, nullptr, SourceLocation()); + S.TheLexer.setParsingPreprocessorDirective(true); + if (S.lexModule(First, End)) + return false; + auto IsCXXNamedModuleDirective = [](const DirectiveWithTokens &D) { + switch (D.Kind) { + case dependency_directives_scan::cxx_module_decl: + case dependency_directives_scan::cxx_import_decl: + case dependency_directives_scan::cxx_export_module_decl: + case dependency_directives_scan::cxx_export_import_decl: + return true; + default: + return false; + } + }; + return llvm::any_of(S.DirsWithToks, IsCXXNamedModuleDirective); +} diff --git a/clang/test/Driver/modules-driver-cxx20-module-usage-scanner.cpp b/clang/test/Driver/modules-driver-cxx20-module-usage-scanner.cpp new file mode 100644 index 000000000000..a434587a7875 --- /dev/null +++ b/clang/test/Driver/modules-driver-cxx20-module-usage-scanner.cpp @@ -0,0 +1,192 @@ +// The driver never checks to implicitly enable the explicit module build +// support unless at least two input files are provided. +// To trigger the C++20 module usage check, we always pass a second dummy file +// as input. +// TODO: Remove -fmodules everywhere once implicitly enabled explicit module +// builds are supported. 
+ +// RUN: split-file %s %t +//--- empty.cpp +// Nothing here + +//--- only-global.cpp +// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/only-global.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK1 +// CHECK1: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +module; + +//--- only-import.cpp +// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/only-import.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK2 +// CHECK2: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +import A; + +//--- only-export.cpp +// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/only-export.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK3 +// CHECK3: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +export module A; + +//--- leading-line-comment.cpp +// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/leading-line-comment.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK4 +// CHECK4: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +// My line comment +import A; + +//--- leading-block-comment1.cpp +// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/leading-block-comment1.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK5 +// CHECK5: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +/*My block comment */ +import A; + +//--- leading-block-comment2.cpp +// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/leading-block-comment2.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK6 +// CHECK6: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +/*My block comment */ import A; + +//--- inline-block-comment1.cpp +// RUN: %clang 
-std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/inline-block-comment1.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK7 +// CHECK7: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +export/*a comment*/module/*another comment*/A; + +//--- inline-block-comment2.cpp +// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/inline-block-comment2.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK8 +// CHECK8: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +module/*a comment*/; + +//--- leading-directives.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/leading-directives.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK9 +// CHECK9: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +#define A +#undef A +#if A +#ifdef A +#elifdef A +#elifndef A +#endif +#ifndef A +#elif A +#else +#endif +#endif +#pragma once; +#include +import m; + +//--- multiline-directive.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/multiline-directive.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK10 +// CHECK10: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +#define MACRO(a, \ + b) \ + call((a), \ + (b) +import a; + +//--- leading-line-splice.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/leading-line-splice.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK11 +// CHECK11: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +\ +module; + +//--- leading-line-splice-trailing-whitespace.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/leading-line-splice-trailing-whitespace.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK12 +// 
CHECK12: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +// v This backslash has trailing whitespace. + \ +export module A; + +//--- comment-line-splice.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/comment-line-splice.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK13 +// CHECK13-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +// My comment continues next-line!\ +import A; + +//--- comment-line-splice-trailing-whitespace.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/comment-line-splice-trailing-whitespace.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK14 +// CHECK14-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +// My comment continues next-line! This backslash has trailing whitespace. -> \ +module; + +//--- line-splice-in-directive1.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/line-splice-in-directive1.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK15 +// CHECK15: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] + +module\ +; + +//--- line-splice-in-directive2.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/line-splice-in-directive2.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK16 +// CHECK16: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] + +export\ + module\ + A; + +//--- no-module-usage1.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/no-module-usage1.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK17 +// CHECK17-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +auto main() -> int {} + +//--- no-module-usage2.cpp +// RUN: %clang 
-std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/no-module-usage2.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK18 +// CHECK18-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +moduleStruct{}; + +//--- no-module-usage3.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/no-module-usage3.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK19 +// CHECK19-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +export_struct{}; + +//--- no-module-usage-namespace-import.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/no-module-usage-namespace-import.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK20 +// CHECK20-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +import::inner xi = {}; + +//--- no-module-usage-namespace-module.cpp +// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: %t/no-module-usage-namespace-module.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK21 +// CHECK21-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver] +module::inner yi = {}; + +// RUN: not %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \ +// RUN: imaginary-file.cpp %t/empty.cpp 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-NON-EXISTING-FILE-ERR +// CHECK-NON-EXISTING-FILE-ERR: clang: error: no such file or directory: 'imaginary-file.cpp'