6 Commits

Author SHA1 Message Date
Michele Scuttari
67d0d7ac0a
[MLIR] Update pass declarations to new autogenerated files
The patch introduces the required changes to update the pass declarations and definitions to use the new autogenerated files and allow dropping the old infrastructure.

Reviewed By: mehdi_amini, rriddle

Differential Review: https://reviews.llvm.org/D132838
2022-08-31 12:28:45 +02:00
Michele Scuttari
039b969b32
Revert "[MLIR] Update pass declarations to new autogenerated files"
This reverts commit 2be8af8f0e0780901213b6fd3013a5268ddc3359.
2022-08-30 22:21:55 +02:00
Michele Scuttari
2be8af8f0e
[MLIR] Update pass declarations to new autogenerated files
The patch introduces the required changes to update the pass declarations and definitions to use the new autogenerated files and allow dropping the old infrastructure.

Reviewed By: mehdi_amini, rriddle

Differential Review: https://reviews.llvm.org/D132838
2022-08-30 21:56:31 +02:00
Christian Sigg
400fef081a Recommit: "[MLIR][NVVM] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration."
This change rolls bcfc0a9051014437b55ab932d9aca5ecdca6776b forward (i.e., reverting 369ce54bb302f209239b8ebc77ad824add9df089) with fixed CMakeLists.txt.
2022-06-05 09:11:43 +02:00
Mehdi Amini
369ce54bb3 Revert "[MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration."
This reverts commit bcfc0a9051014437b55ab932d9aca5ecdca6776b.

The build is broken with shared library enabled.
2022-06-04 08:35:45 +00:00
Christian Sigg
bcfc0a9051 [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.
This is correct for all values, i.e. the same as promoting the division to fp32 in the NVPTX backend. But it is faster (~10% in average, sometimes more) because:

- it performs less Newton iterations
- it avoids the slow path for e.g. denormals
- it allows reuse of the reciprocal for multiple divisions by the same divisor

Test program:
```
#include <stdio.h>
#include "cuda_fp16.h"

// This is a variant of CUDA's own __hdiv which is fast than hdiv_promote below
// and doesn't suffer from the perf cliff of div.rn.fp32 with 'special' values.
__device__ half hdiv_newton(half a, half b) {
  float fa = __half2float(a);
  float fb = __half2float(b);

  float rcp;
  asm("{rcp.approx.ftz.f32 %0, %1;\n}" : "=f"(rcp) : "f"(fb));

  float result = fa * rcp;
  auto exponent = reinterpret_cast<const unsigned&>(result) & 0x7f800000;
  if (exponent != 0 && exponent != 0x7f800000) {
    float err = __fmaf_rn(-fb, result, fa);
    result = __fmaf_rn(rcp, err, result);
  }

  return __float2half(result);
}

// Surprisingly, this is faster than CUDA's own __hdiv.
__device__ half hdiv_promote(half a, half b) {
  return __float2half(__half2float(a) / __half2float(b));
}

// This is an approximation that is accurate up to 1 ulp.
__device__ half hdiv_approx(half a, half b) {
  float fa = __half2float(a);
  float fb = __half2float(b);

  float result;
  asm("{div.approx.ftz.f32 %0, %1, %2;\n}" : "=f"(result) : "f"(fa), "f"(fb));
  return __float2half(result);
}

__global__ void CheckCorrectness() {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  half x = reinterpret_cast<const half&>(i);
  for (int j = 0; j < 65536; ++j) {
    half y = reinterpret_cast<const half&>(j);
    half d1 = hdiv_newton(x, y);
    half d2 = hdiv_promote(x, y);
    auto s1 = reinterpret_cast<const short&>(d1);
    auto s2 = reinterpret_cast<const short&>(d2);
    if (s1 != s2) {
      printf("%f (%u) / %f (%u), got %f (%hu), expected: %f (%hu)\n",
             __half2float(x), i, __half2float(y), j, __half2float(d1), s1,
             __half2float(d2), s2);
      //__trap();
    }
  }
}

__device__ half dst;

__global__ void ProfileBuiltin(half x) {
  #pragma unroll 1
  for (int i = 0; i < 10000000; ++i) {
    x = x / x;
  }
  dst = x;
}

__global__ void ProfilePromote(half x) {
  #pragma unroll 1
  for (int i = 0; i < 10000000; ++i) {
    x = hdiv_promote(x, x);
  }
  dst = x;
}

__global__ void ProfileNewton(half x) {
  #pragma unroll 1
  for (int i = 0; i < 10000000; ++i) {
    x = hdiv_newton(x, x);
  }
  dst = x;
}

__global__ void ProfileApprox(half x) {
  #pragma unroll 1
  for (int i = 0; i < 10000000; ++i) {
    x = hdiv_approx(x, x);
  }
  dst = x;
}

int main() {
  CheckCorrectness<<<256, 256>>>();
  half one = __float2half(1.0f);
  ProfileBuiltin<<<1, 1>>>(one);  // 1.001s
  ProfilePromote<<<1, 1>>>(one);  // 0.560s
  ProfileNewton<<<1, 1>>>(one);   // 0.508s
  ProfileApprox<<<1, 1>>>(one);   // 0.304s
  auto status = cudaDeviceSynchronize();
  printf("%s\n", cudaGetErrorString(status));
}
```

Reviewed By: herhut

Differential Revision: https://reviews.llvm.org/D126158
2022-06-04 08:03:29 +02:00