Rana Pratap Reddy df9eb79970
[Clang][AMDGPU] Lower __amdgpu_texture_t to <8 x i32> instead of ptr addrspace(0) (#187774)
Fix the IR lowering for `__amdgpu_texture_t` to generate a single
256-bit load instead of a double indirection through a flat pointer.

Previously, `__amdgpu_texture_t` was lowered to `ptr addrspace(0)`
(a 64-bit flat pointer), which caused a double load through an extra
level of indirection. Using the same reproducer as in #187697:

```c
#define TSHARP __constant uint *

// Old tsharp handling:
// #define LOAD_TSHARP(I) *(__constant uint8 *)I

#define LOAD_TSHARP(I) *(__constant __amdgpu_texture_t *)I

float4 test_image_load_1D(TSHARP i, int c) {
  return __builtin_amdgcn_image_load_1d_v4f32_i32(15, c, LOAD_TSHARP(i), 0, 0);
}
```
Old output (before this fix):

```llvm
define hidden <4 x float> @test_image_load_1D(ptr addrspace(4) noundef readonly captures(none) %i, i32 noundef %c) local_unnamed_addr #0 {
entry:
  %0 = load ptr, ptr addrspace(4) %i, align 32, !tbaa !9
  %1 = addrspacecast ptr %0 to ptr addrspace(1)
  %tex.rsrc.val = load <8 x i32>, ptr addrspace(1) %1, align 32
  %2 = tail call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32.v8i32(i32 15, i32 %c, <8 x i32> %tex.rsrc.val, i32 0, i32 0)
  ret <4 x float> %2
}
```
With this fix, the new output matches the old `__constant uint8 *`
behavior (a single 256-bit load):
```llvm
define hidden <4 x float> @test_image_load_1D(ptr addrspace(4) noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
  %3 = load <8 x i32>, ptr addrspace(4) %0, align 32, !tbaa !10
  %4 = tail call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32.v8i32(i32 15, i32 %1, <8 x i32> %3, i32 0, i32 0)
  ret <4 x float> %4
}
```

Fixes #187697
2026-03-21 22:21:12 +05:30
..