Johannes Doerfert 2641ed7d26
[OpenMP][FIX] Check for requirements early (#104836)
If we can't transform the region to SPMD, we should not wait till the
end to decide that. Other AAs might assume SPMD, and we did set the
constant initializer to indicate SPMD, but we did not change the code
properly.
2024-08-20 09:05:23 -07:00

4671 lines
336 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU
; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX
; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=AMDGPU-DISABLED1
; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt-postlink < %s | FileCheck %s --check-prefix=AMDGPU-DISABLED2
; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=NVPTX-DISABLED1
; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt-postlink < %s | FileCheck %s --check-prefix=NVPTX-DISABLED2
;; void unknown(void);
;; [[omp::assume("ompx_spmd_amenable")]] void spmd_amenable(void);
;;
;; void sequential_loop() {
;; #pragma omp target teams
;; {
;; for (int i = 0; i < 100; ++i) {
;; #pragma omp parallel
;; {
;; unknown();
;; }
;; }
;; spmd_amenable();
;; }
;; }
;;
;; [[omp::assume("ompx_spmd_amenable")]] void use(__attribute__((noescape)) int *);
;;
;; void sequential_loop_to_stack_var() {
;; #pragma omp target teams
;; {
;; int x;
;; use(&x);
;; for (int i = 0; i < 100; ++i) {
;; #pragma omp parallel
;; {
;; unknown();
;; }
;; }
;; spmd_amenable();
;; }
;; }
;;
;; void sequential_loop_to_shared_var() {
;; #pragma omp target teams
;; {
;; int x;
;; for (int i = 0; i < 100; ++i) {
;; #pragma omp parallel
;; {
;; x++;
;; unknown();
;; }
;; }
;; spmd_amenable();
;; }
;; }
;;
;; void sequential_loop_to_shared_var_guarded() {
;; #pragma omp target teams
;; {
;; int x = 42;
;; for (int i = 0; i < 100; ++i) {
;; #pragma omp parallel
;; {
;; x++;
;; unknown();
;; }
;; }
;; spmd_amenable();
;; }
;; }
;;
;; void do_not_spmdize_target() {
;; #pragma omp target teams
;; {
;; // Incompatible parallel level, called both
;; // from parallel and target regions
;; unknown();
;; }
;; }
;;
;; void do_not_spmdize_task() {
;; #pragma omp target
;; {
;; #pragma omp task
;; spmd_amenable();
;; #pragma omp parallel
;; unknown();
;; }
;; }
%struct.ident_t = type { i32, i32, i32, i32, ptr }
%struct.kmp_task_t_with_privates = type { %struct.kmp_task_t }
%struct.kmp_task_t = type { ptr, ptr, i32, %union.kmp_cmplrdata_t, %union.kmp_cmplrdata_t }
%union.kmp_cmplrdata_t = type { ptr }
%struct.anon = type {}
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @0 }, align 8
@__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
; AMDGPU-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; AMDGPU-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
; AMDGPU-DISABLED: @[[X_SHARED_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
; AMDGPU-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; AMDGPU-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; AMDGPU-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; AMDGPU-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; AMDGPU-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; NVPTX-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; NVPTX-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
; NVPTX-DISABLED: @[[X_SHARED1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
; NVPTX-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; NVPTX-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; NVPTX-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; NVPTX-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
; NVPTX-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
;.
; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; AMDGPU: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
; AMDGPU: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
; AMDGPU: @x_shared.1 = internal addrspace(3) global [4 x i8] poison, align 4
; AMDGPU: @__omp_outlined__9_wrapper.ID = private constant i8 undef
;.
; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; NVPTX: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
; NVPTX: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
; NVPTX: @x_shared1 = internal addrspace(3) global [4 x i8] poison, align 4
; NVPTX: @__omp_outlined__9_wrapper.ID = private constant i8 undef
;.
; AMDGPU-DISABLED1: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; AMDGPU-DISABLED1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED1: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
; AMDGPU-DISABLED1: @x_shared.1 = internal addrspace(3) global [4 x i8] poison, align 4
; AMDGPU-DISABLED1: @__omp_outlined__1_wrapper.ID = private constant i8 undef
; AMDGPU-DISABLED1: @__omp_outlined__3_wrapper.ID = private constant i8 undef
; AMDGPU-DISABLED1: @__omp_outlined__5_wrapper.ID = private constant i8 undef
; AMDGPU-DISABLED1: @__omp_outlined__7_wrapper.ID = private constant i8 undef
; AMDGPU-DISABLED1: @__omp_outlined__9_wrapper.ID = private constant i8 undef
;.
; AMDGPU-DISABLED2: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; AMDGPU-DISABLED2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; AMDGPU-DISABLED2: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
; AMDGPU-DISABLED2: @x_shared.1 = internal addrspace(3) global [4 x i8] poison, align 4
; AMDGPU-DISABLED2: @__omp_outlined__1_wrapper.ID = private constant i8 undef
; AMDGPU-DISABLED2: @__omp_outlined__3_wrapper.ID = private constant i8 undef
; AMDGPU-DISABLED2: @__omp_outlined__5_wrapper.ID = private constant i8 undef
; AMDGPU-DISABLED2: @__omp_outlined__7_wrapper.ID = private constant i8 undef
; AMDGPU-DISABLED2: @__omp_outlined__9_wrapper.ID = private constant i8 undef
;.
; NVPTX-DISABLED1: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; NVPTX-DISABLED1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED1: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
; NVPTX-DISABLED1: @x_shared1 = internal addrspace(3) global [4 x i8] poison, align 4
; NVPTX-DISABLED1: @__omp_outlined__1_wrapper.ID = private constant i8 undef
; NVPTX-DISABLED1: @__omp_outlined__3_wrapper.ID = private constant i8 undef
; NVPTX-DISABLED1: @__omp_outlined__5_wrapper.ID = private constant i8 undef
; NVPTX-DISABLED1: @__omp_outlined__7_wrapper.ID = private constant i8 undef
; NVPTX-DISABLED1: @__omp_outlined__9_wrapper.ID = private constant i8 undef
;.
; NVPTX-DISABLED2: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; NVPTX-DISABLED2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
; NVPTX-DISABLED2: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
; NVPTX-DISABLED2: @x_shared1 = internal addrspace(3) global [4 x i8] poison, align 4
; NVPTX-DISABLED2: @__omp_outlined__1_wrapper.ID = private constant i8 undef
; NVPTX-DISABLED2: @__omp_outlined__3_wrapper.ID = private constant i8 undef
; NVPTX-DISABLED2: @__omp_outlined__5_wrapper.ID = private constant i8 undef
; NVPTX-DISABLED2: @__omp_outlined__7_wrapper.ID = private constant i8 undef
; NVPTX-DISABLED2: @__omp_outlined__9_wrapper.ID = private constant i8 undef
;.
define weak void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] {
; AMDGPU-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
; NVPTX-SAME: () #[[ATTR0:[0-9]+]] {
; NVPTX-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
; AMDGPU-DISABLED1-SAME: () #[[ATTR0:[0-9]+]] {
; AMDGPU-DISABLED1-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
; AMDGPU-DISABLED2-SAME: () #[[ATTR0:[0-9]+]] {
; AMDGPU-DISABLED2-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
; NVPTX-DISABLED1-SAME: () #[[ATTR0:[0-9]+]] {
; NVPTX-DISABLED1-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
; NVPTX-DISABLED2-SAME: () #[[ATTR0:[0-9]+]] {
; NVPTX-DISABLED2-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
; AMDGPU-DISABLED-SAME: () #[[ATTR0:[0-9]+]] {
; AMDGPU-DISABLED-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
; NVPTX-DISABLED-SAME: () #[[ATTR0:[0-9]+]] {
; NVPTX-DISABLED-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
; NVPTX-DISABLED-NEXT: ret void
call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug()
ret void
}
define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug
; AMDGPU-SAME: () #[[ATTR1:[0-9]+]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null)
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU: common.ret:
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]]
; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
; AMDGPU-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug
; NVPTX-SAME: () #[[ATTR1:[0-9]+]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null)
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX: common.ret:
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]]
; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
; NVPTX-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug
; AMDGPU-DISABLED1-SAME: () #[[ATTR1:[0-9]+]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null)
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED1: is_worker_check:
; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.begin:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.finished:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: worker_state_machine.is_active.check:
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check1:
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED1: worker_state_machine.done.barrier:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED1: thread.user_code.check:
; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED1: common.ret:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: user_code.entry:
; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]]
; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug
; AMDGPU-DISABLED2-SAME: () #[[ATTR1:[0-9]+]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null)
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED2: is_worker_check:
; AMDGPU-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.begin:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.finished:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: worker_state_machine.is_active.check:
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED2-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check1:
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED2: worker_state_machine.done.barrier:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED2: thread.user_code.check:
; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED2: common.ret:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: user_code.entry:
; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]]
; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug
; NVPTX-DISABLED1-SAME: () #[[ATTR1:[0-9]+]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null)
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED1: is_worker_check:
; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED1: worker_state_machine.begin:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED1: worker_state_machine.finished:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: worker_state_machine.is_active.check:
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check1:
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED1: worker_state_machine.done.barrier:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED1: thread.user_code.check:
; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED1: common.ret:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: user_code.entry:
; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]]
; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug
; NVPTX-DISABLED2-SAME: () #[[ATTR1:[0-9]+]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null)
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED2: is_worker_check:
; NVPTX-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED2: worker_state_machine.begin:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED2: worker_state_machine.finished:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: worker_state_machine.is_active.check:
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED2-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check1:
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED2: worker_state_machine.done.barrier:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED2: thread.user_code.check:
; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED2: common.ret:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: user_code.entry:
; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]]
; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug
; AMDGPU-DISABLED-SAME: () #[[ATTR1:[0-9]+]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment)
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED: is_worker_check:
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED: worker_state_machine.begin:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED: worker_state_machine.finished:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__1_wrapper.ID
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED: worker_state_machine.done.barrier:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED: thread.user_code.check:
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED: common.ret:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug
; NVPTX-DISABLED-SAME: () #[[ATTR1:[0-9]+]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment)
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED: is_worker_check:
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED: worker_state_machine.begin:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED: worker_state_machine.finished:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker_state_machine.is_active.check:
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__1_wrapper.ID
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED: worker_state_machine.done.barrier:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED: thread.user_code.check:
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED: common.ret:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
entry:
%.zero.addr = alloca i32, align 4
%.threadid_temp. = alloca i32, align 4
%0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment, ptr null)
%exec_user_code = icmp eq i32 %0, -1
br i1 %exec_user_code, label %user_code.entry, label %common.ret
common.ret: ; preds = %entry, %user_code.entry
ret void
user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(ptr @1)
store i32 0, ptr %.zero.addr, align 4
store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
call void @__omp_outlined__(ptr %.threadid_temp., ptr %.zero.addr) #6
call void @__kmpc_target_deinit()
br label %common.ret
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU: for.cond:
; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU: for.cond.cleanup:
; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]]
; AMDGPU-NEXT: ret void
; AMDGPU: for.body:
; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-NEXT: br label [[FOR_COND:%.*]]
; NVPTX: for.cond:
; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX: for.cond.cleanup:
; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]]
; NVPTX-NEXT: ret void
; NVPTX: for.body:
; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED1: for.cond:
; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED1: for.cond.cleanup:
; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]]
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: for.body:
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED2: for.cond:
; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED2: for.cond.cleanup:
; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]]
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: for.body:
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED1: for.cond:
; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED1: for.cond.cleanup:
; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]]
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: for.body:
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED2: for.cond:
; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED2: for.cond.cleanup:
; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]]
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: for.body:
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED: for.cond:
; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED: for.cond.cleanup:
; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]]
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: for.body:
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED: for.cond:
; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED: for.cond.cleanup:
; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]]
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: for.body:
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
entry:
%captured_vars_addrs = alloca [0 x ptr], align 8
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp slt i32 %i.0, 100
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
call void @spmd_amenable() #10
ret void
for.body: ; preds = %for.cond
%0 = load i32, ptr %.global_tid., align 4, !tbaa !18
call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr %captured_vars_addrs, i64 0)
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !22
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; NVPTX-DISABLED-NEXT: ret void
entry:
call void @unknown() #11
ret void
}
; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: ret void
entry:
%.addr1 = alloca i32, align 4
%.zero.addr = alloca i32, align 4
%global_args = alloca ptr, align 8
store i32 %1, ptr %.addr1, align 4, !tbaa !18
store i32 0, ptr %.zero.addr, align 4
call void @__kmpc_get_shared_variables(ptr %global_args)
call void @__omp_outlined__1(ptr %.addr1, ptr %.zero.addr) #6
ret void
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20() #0 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
; AMDGPU-SAME: () #[[ATTR0]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null)
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU: common.ret:
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
; NVPTX-SAME: () #[[ATTR0]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null)
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX: common.ret:
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null)
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED1: is_worker_check:
; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.begin:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.finished:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: worker_state_machine.is_active.check:
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check1:
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED1: worker_state_machine.done.barrier:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED1: thread.user_code.check:
; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED1: common.ret:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: user_code.entry:
; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null)
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED2: is_worker_check:
; AMDGPU-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.begin:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.finished:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: worker_state_machine.is_active.check:
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED2-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check1:
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED2: worker_state_machine.done.barrier:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED2: thread.user_code.check:
; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED2: common.ret:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: user_code.entry:
; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
; NVPTX-DISABLED1-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null)
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED1: is_worker_check:
; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED1: worker_state_machine.begin:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED1: worker_state_machine.finished:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: worker_state_machine.is_active.check:
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check1:
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED1: worker_state_machine.done.barrier:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED1: thread.user_code.check:
; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED1: common.ret:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: user_code.entry:
; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
; NVPTX-DISABLED2-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null)
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED2: is_worker_check:
; NVPTX-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED2: worker_state_machine.begin:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED2: worker_state_machine.finished:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: worker_state_machine.is_active.check:
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED2-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check1:
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED2: worker_state_machine.done.barrier:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED2: thread.user_code.check:
; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED2: common.ret:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: user_code.entry:
; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment)
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED: is_worker_check:
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED: worker_state_machine.begin:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED: worker_state_machine.finished:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__3_wrapper.ID
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED: worker_state_machine.done.barrier:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED: thread.user_code.check:
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED: common.ret:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment)
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED: is_worker_check:
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED: worker_state_machine.begin:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED: worker_state_machine.finished:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker_state_machine.is_active.check:
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__3_wrapper.ID
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED: worker_state_machine.done.barrier:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED: thread.user_code.check:
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED: common.ret:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
entry:
%.zero.addr = alloca i32, align 4
%.threadid_temp. = alloca i32, align 4
%0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment, ptr null)
%exec_user_code = icmp eq i32 %0, -1
br i1 %exec_user_code, label %user_code.entry, label %common.ret
common.ret: ; preds = %entry, %user_code.entry
ret void
user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(ptr @1)
store i32 0, ptr %.zero.addr, align 4
store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
call void @__omp_outlined__2(ptr %.threadid_temp., ptr %.zero.addr) #6
call void @__kmpc_target_deinit()
br label %common.ret
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr
; AMDGPU-NEXT: call void @use(ptr nocapture [[MALLOC_CAST]]) #[[ATTR7]]
; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU: for.cond:
; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU: for.cond.cleanup:
; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-NEXT: ret void
; AMDGPU: for.body:
; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4
; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-NEXT: call void @use(ptr nocapture [[X_H2S]]) #[[ATTR7]]
; NVPTX-NEXT: br label [[FOR_COND:%.*]]
; NVPTX: for.cond:
; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX: for.cond.cleanup:
; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-NEXT: ret void
; NVPTX: for.body:
; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__2
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED1-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr
; AMDGPU-DISABLED1-NEXT: call void @use(ptr nocapture [[MALLOC_CAST]]) #[[ATTR7]]
; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED1: for.cond:
; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED1: for.cond.cleanup:
; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: for.body:
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__2
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED2-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr
; AMDGPU-DISABLED2-NEXT: call void @use(ptr nocapture [[MALLOC_CAST]]) #[[ATTR7]]
; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED2: for.cond:
; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED2: for.cond.cleanup:
; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: for.body:
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__2
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4
; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED1-NEXT: call void @use(ptr nocapture [[X_H2S]]) #[[ATTR7]]
; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED1: for.cond:
; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED1: for.cond.cleanup:
; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: for.body:
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__2
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4
; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED2-NEXT: call void @use(ptr nocapture [[X_H2S]]) #[[ATTR7]]
; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED2: for.cond:
; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED2: for.cond.cleanup:
; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: for.body:
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast ptr addrspace(5) [[X_H2S]] to ptr
; AMDGPU-DISABLED-NEXT: call void @use(ptr nocapture [[MALLOC_CAST]]) #[[ATTR7]]
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED: for.cond:
; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED: for.cond.cleanup:
; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: for.body:
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED-NEXT: call void @use(ptr nocapture [[X_H2S]]) #[[ATTR7]]
; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED: for.cond:
; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED: for.cond.cleanup:
; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: for.body:
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
entry:
%captured_vars_addrs = alloca [0 x ptr], align 8
%x = call align 4 ptr @__kmpc_alloc_shared(i64 4)
call void @use(ptr nocapture %x) #10
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp slt i32 %i.0, 100
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
call void @spmd_amenable() #10
call void @__kmpc_free_shared(ptr %x, i64 4)
ret void
for.body: ; preds = %for.cond
%0 = load i32, ptr %.global_tid., align 4, !tbaa !18
call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr %captured_vars_addrs, i64 0)
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !25
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__3
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__3
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__3
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__3
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED-NEXT: ret void
entry:
call void @unknown() #11
ret void
}
; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: ret void
entry:
%.addr1 = alloca i32, align 4
%.zero.addr = alloca i32, align 4
%global_args = alloca ptr, align 8
store i32 %1, ptr %.addr1, align 4, !tbaa !18
store i32 0, ptr %.zero.addr, align 4
call void @__kmpc_get_shared_variables(ptr %global_args)
call void @__omp_outlined__3(ptr %.addr1, ptr %.zero.addr) #6
ret void
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35() #0 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
; AMDGPU-SAME: () #[[ATTR0]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null)
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU: common.ret:
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
; NVPTX-SAME: () #[[ATTR0]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null)
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX: common.ret:
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null)
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED1: is_worker_check:
; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.begin:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.finished:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: worker_state_machine.is_active.check:
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check1:
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED1: worker_state_machine.done.barrier:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED1: thread.user_code.check:
; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED1: common.ret:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: user_code.entry:
; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null)
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED2: is_worker_check:
; AMDGPU-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.begin:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.finished:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: worker_state_machine.is_active.check:
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED2-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check1:
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED2: worker_state_machine.done.barrier:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED2: thread.user_code.check:
; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED2: common.ret:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: user_code.entry:
; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
; NVPTX-DISABLED1-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null)
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED1: is_worker_check:
; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED1: worker_state_machine.begin:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED1: worker_state_machine.finished:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: worker_state_machine.is_active.check:
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check1:
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED1: worker_state_machine.done.barrier:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED1: thread.user_code.check:
; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED1: common.ret:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: user_code.entry:
; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
; NVPTX-DISABLED2-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null)
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED2: is_worker_check:
; NVPTX-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED2: worker_state_machine.begin:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED2: worker_state_machine.finished:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: worker_state_machine.is_active.check:
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED2-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check1:
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED2: worker_state_machine.done.barrier:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED2: thread.user_code.check:
; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED2: common.ret:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: user_code.entry:
; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment)
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED: is_worker_check:
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED: worker_state_machine.begin:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED: worker_state_machine.finished:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__5_wrapper.ID
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED: worker_state_machine.done.barrier:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED: thread.user_code.check:
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED: common.ret:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment)
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED: is_worker_check:
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED: worker_state_machine.begin:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED: worker_state_machine.finished:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker_state_machine.is_active.check:
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__5_wrapper.ID
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED: worker_state_machine.done.barrier:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED: thread.user_code.check:
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED: common.ret:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
entry:
%.zero.addr = alloca i32, align 4
%.threadid_temp. = alloca i32, align 4
%0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment, ptr null)
%exec_user_code = icmp eq i32 %0, -1
br i1 %exec_user_code, label %user_code.entry, label %common.ret
common.ret: ; preds = %entry, %user_code.entry
ret void
user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(ptr @1)
store i32 0, ptr %.zero.addr, align 4
store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
call void @__omp_outlined__4(ptr %.threadid_temp., ptr %.zero.addr) #6
call void @__kmpc_target_deinit()
br label %common.ret
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__4
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU: for.cond:
; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU: for.cond.cleanup:
; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-NEXT: ret void
; AMDGPU: for.body:
; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]]
; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-NEXT: br label [[FOR_COND:%.*]]
; NVPTX: for.cond:
; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX: for.cond.cleanup:
; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-NEXT: ret void
; NVPTX: for.body:
; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]]
; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__4
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED1: for.cond:
; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED1: for.cond.cleanup:
; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: for.body:
; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]]
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__4
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED2: for.cond:
; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED2: for.cond.cleanup:
; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: for.body:
; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]]
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__4
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED1: for.cond:
; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED1: for.cond.cleanup:
; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: for.body:
; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]]
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__4
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED2: for.cond:
; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED2: for.cond.cleanup:
; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: for.body:
; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]]
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED: for.cond:
; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED: for.cond.cleanup:
; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: for.body:
; AMDGPU-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]]
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED: for.cond:
; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED: for.cond.cleanup:
; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: for.body:
; NVPTX-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]]
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
entry:
%captured_vars_addrs = alloca [1 x ptr], align 8
%x = call align 4 ptr @__kmpc_alloc_shared(i64 4)
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp slt i32 %i.0, 100
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
call void @spmd_amenable() #10
call void @__kmpc_free_shared(ptr %x, i64 4)
ret void
for.body: ; preds = %for.cond
store ptr %x, ptr %captured_vars_addrs, align 8, !tbaa !26
%0 = load i32, ptr %.global_tid., align 4, !tbaa !18
call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr %captured_vars_addrs, i64 1)
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !28
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED-NEXT: ret void
entry:
%0 = load i32, ptr %x, align 4, !tbaa !18
%inc = add nsw i32 %0, 1
store i32 %inc, ptr %x, align 4, !tbaa !18
call void @unknown() #11
ret void
}
; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: ret void
entry:
%.addr1 = alloca i32, align 4
%.zero.addr = alloca i32, align 4
%global_args = alloca ptr, align 8
store i32 %1, ptr %.addr1, align 4, !tbaa !18
store i32 0, ptr %.zero.addr, align 4
call void @__kmpc_get_shared_variables(ptr %global_args)
%2 = load ptr, ptr %global_args, align 8
%3 = load ptr, ptr %2, align 8, !tbaa !26
call void @__omp_outlined__5(ptr %.addr1, ptr %.zero.addr, ptr %3) #6
ret void
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50() #0 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
; AMDGPU-SAME: () #[[ATTR0]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null)
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU: common.ret:
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
; NVPTX-SAME: () #[[ATTR0]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null)
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX: common.ret:
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null)
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED1: is_worker_check:
; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.begin:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.finished:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: worker_state_machine.is_active.check:
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check1:
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED1: worker_state_machine.done.barrier:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED1: thread.user_code.check:
; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED1: common.ret:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: user_code.entry:
; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null)
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED2: is_worker_check:
; AMDGPU-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.begin:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.finished:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: worker_state_machine.is_active.check:
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED2-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check1:
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED2: worker_state_machine.done.barrier:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED2: thread.user_code.check:
; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED2: common.ret:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: user_code.entry:
; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
; NVPTX-DISABLED1-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null)
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED1: is_worker_check:
; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED1: worker_state_machine.begin:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED1: worker_state_machine.finished:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: worker_state_machine.is_active.check:
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED1-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check1:
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED1: worker_state_machine.done.barrier:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED1: thread.user_code.check:
; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED1: common.ret:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: user_code.entry:
; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
; NVPTX-DISABLED2-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null)
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED2: is_worker_check:
; NVPTX-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED2: worker_state_machine.begin:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED2: worker_state_machine.finished:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: worker_state_machine.is_active.check:
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED2-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check1:
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED2: worker_state_machine.done.barrier:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED2: thread.user_code.check:
; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED2: common.ret:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: user_code.entry:
; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment)
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED: is_worker_check:
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED: worker_state_machine.begin:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED: worker_state_machine.finished:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__7_wrapper.ID
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED: worker_state_machine.done.barrier:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED: thread.user_code.check:
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED: common.ret:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment)
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED: is_worker_check:
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED: worker_state_machine.begin:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED: worker_state_machine.finished:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker_state_machine.is_active.check:
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__7_wrapper.ID
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED: worker_state_machine.done.barrier:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED: thread.user_code.check:
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED: common.ret:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
entry:
%.zero.addr = alloca i32, align 4
%.threadid_temp. = alloca i32, align 4
%0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment, ptr null)
%exec_user_code = icmp eq i32 %0, -1
br i1 %exec_user_code, label %user_code.entry, label %common.ret
common.ret: ; preds = %entry, %user_code.entry
ret void
user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(ptr @1)
store i32 0, ptr %.zero.addr, align 4
store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
call void @__omp_outlined__6(ptr %.threadid_temp., ptr %.zero.addr) #6
call void @__kmpc_target_deinit()
br label %common.ret
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__6
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-NEXT: br label [[REGION_CHECK_TID:%.*]]
; AMDGPU: region.check.tid:
; AMDGPU-NEXT: [[TMP0:%.*]] = call fastcc i32 @__kmpc_get_hardware_thread_id_in_block()
; AMDGPU-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
; AMDGPU-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
; AMDGPU: region.guarded:
; AMDGPU-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: br label [[REGION_GUARDED_END:%.*]]
; AMDGPU: region.guarded.end:
; AMDGPU-NEXT: br label [[REGION_BARRIER]]
; AMDGPU: region.barrier:
; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP0]])
; AMDGPU-NEXT: br label [[REGION_EXIT:%.*]]
; AMDGPU: region.exit:
; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU: for.cond:
; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[REGION_EXIT]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU: for.cond.cleanup:
; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-NEXT: ret void
; AMDGPU: for.body:
; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]]
; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-NEXT: br label [[REGION_CHECK_TID:%.*]]
; NVPTX: region.check.tid:
; NVPTX-NEXT: [[TMP0:%.*]] = call fastcc i32 @__kmpc_get_hardware_thread_id_in_block()
; NVPTX-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
; NVPTX-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
; NVPTX: region.guarded:
; NVPTX-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: br label [[REGION_GUARDED_END:%.*]]
; NVPTX: region.guarded.end:
; NVPTX-NEXT: br label [[REGION_BARRIER]]
; NVPTX: region.barrier:
; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP0]])
; NVPTX-NEXT: br label [[REGION_EXIT:%.*]]
; NVPTX: region.exit:
; NVPTX-NEXT: br label [[FOR_COND:%.*]]
; NVPTX: for.cond:
; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[REGION_EXIT]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX: for.cond.cleanup:
; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-NEXT: ret void
; NVPTX: for.body:
; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]]
; NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__6
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED1: for.cond:
; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED1: for.cond.cleanup:
; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: for.body:
; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__6
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED2: for.cond:
; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED2: for.cond.cleanup:
; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: for.body:
; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__6
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED1: for.cond:
; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED1-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED1-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED1: for.cond.cleanup:
; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: for.body:
; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__6
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED2: for.cond:
; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED2-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED2: for.cond.cleanup:
; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: for.body:
; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; AMDGPU-DISABLED-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
; AMDGPU-DISABLED: for.cond:
; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; AMDGPU-DISABLED: for.cond.cleanup:
; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: for.body:
; AMDGPU-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
; NVPTX-DISABLED-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]]
; NVPTX-DISABLED: for.cond:
; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; NVPTX-DISABLED: for.cond.cleanup:
; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: for.body:
; NVPTX-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
entry:
%captured_vars_addrs = alloca [1 x ptr], align 8
%x = call align 4 ptr @__kmpc_alloc_shared(i64 4)
store i32 42, ptr %x, align 4, !tbaa !18
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp slt i32 %i.0, 100
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
call void @spmd_amenable() #10
call void @__kmpc_free_shared(ptr %x, i64 4)
ret void
for.body: ; preds = %for.cond
store ptr %x, ptr %captured_vars_addrs, align 8, !tbaa !26
%0 = load i32, ptr %.global_tid., align 4, !tbaa !18
call void @__kmpc_parallel_51(ptr @1, i32 %0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr %captured_vars_addrs, i64 1)
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !29
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__7(ptr noalias %.global_tid., ptr noalias %.bound_tid., ptr nonnull align 4 dereferenceable(4) %x) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED1-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED2-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED1-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED2-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; AMDGPU-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; AMDGPU-DISABLED-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
; NVPTX-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]]
; NVPTX-DISABLED-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]]
; NVPTX-DISABLED-NEXT: ret void
entry:
%0 = load i32, ptr %x, align 4, !tbaa !18
%inc = add nsw i32 %0, 1
store i32 %inc, ptr %x, align 4, !tbaa !18
call void @unknowni32p(ptr %x) #11
ret void
}
; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; AMDGPU-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; NVPTX-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: ret void
entry:
%.addr1 = alloca i32, align 4
%.zero.addr = alloca i32, align 4
%global_args = alloca ptr, align 8
store i32 %1, ptr %.addr1, align 4, !tbaa !18
store i32 0, ptr %.zero.addr, align 4
call void @__kmpc_get_shared_variables(ptr %global_args)
%2 = load ptr, ptr %global_args, align 8
%3 = load ptr, ptr %2, align 8, !tbaa !26
call void @__omp_outlined__7(ptr %.addr1, ptr %.zero.addr, ptr %3) #6
ret void
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
; AMDGPU-SAME: () #[[ATTR0]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null)
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU: is_worker_check:
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU: worker_state_machine.begin:
; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU: worker_state_machine.finished:
; AMDGPU-NEXT: ret void
; AMDGPU: worker_state_machine.is_active.check:
; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU: worker_state_machine.parallel_region.end:
; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU: worker_state_machine.done.barrier:
; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU: thread.user_code.check:
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU: common.ret:
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
; NVPTX-SAME: () #[[ATTR0]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: is_worker_check:
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX: worker_state_machine.begin:
; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX: worker_state_machine.parallel_region.end:
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX: common.ret:
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null)
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED1: is_worker_check:
; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.begin:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.finished:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: worker_state_machine.is_active.check:
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED1-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED1: worker_state_machine.done.barrier:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED1: thread.user_code.check:
; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED1: common.ret:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: user_code.entry:
; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null)
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED2: is_worker_check:
; AMDGPU-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.begin:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.finished:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: worker_state_machine.is_active.check:
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED2-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED2: worker_state_machine.done.barrier:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED2: thread.user_code.check:
; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED2: common.ret:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: user_code.entry:
; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
; NVPTX-DISABLED1-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null)
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED1: is_worker_check:
; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED1: worker_state_machine.begin:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED1: worker_state_machine.finished:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: worker_state_machine.is_active.check:
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED1-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED1: worker_state_machine.done.barrier:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED1: thread.user_code.check:
; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED1: common.ret:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: user_code.entry:
; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
; NVPTX-DISABLED2-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null)
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED2: is_worker_check:
; NVPTX-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED2: worker_state_machine.begin:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED2: worker_state_machine.finished:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: worker_state_machine.is_active.check:
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED2-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED2: worker_state_machine.done.barrier:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED2: thread.user_code.check:
; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED2: common.ret:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: user_code.entry:
; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment)
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED: is_worker_check:
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED: worker_state_machine.begin:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED: worker_state_machine.finished:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED: worker_state_machine.done.barrier:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED: thread.user_code.check:
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED: common.ret:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR5]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment)
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED: is_worker_check:
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED: worker_state_machine.begin:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED: worker_state_machine.finished:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker_state_machine.is_active.check:
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED: worker_state_machine.done.barrier:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED: thread.user_code.check:
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED: common.ret:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR5]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
entry:
%.zero.addr = alloca i32, align 4
%.threadid_temp. = alloca i32, align 4
%0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment, ptr null)
%exec_user_code = icmp eq i32 %0, -1
br i1 %exec_user_code, label %user_code.entry, label %common.ret
common.ret: ; preds = %entry, %user_code.entry
ret void
user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(ptr @1)
store i32 0, ptr %.zero.addr, align 4
store i32 %1, ptr %.threadid_temp., align 4, !tbaa !18
call void @__omp_outlined__8(ptr %.threadid_temp., ptr %.zero.addr) #6
call void @__kmpc_target_deinit()
br label %common.ret
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__8(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__8
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__8
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__8
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__8
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__8
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__8
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED-NEXT: ret void
entry:
call void @unknown() #11
ret void
}
; Function Attrs: alwaysinline convergent norecurse nounwind
define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
; AMDGPU-SAME: () #[[ATTR0]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null)
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU: is_worker_check:
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU: worker_state_machine.begin:
; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU: worker_state_machine.finished:
; AMDGPU-NEXT: ret void
; AMDGPU: worker_state_machine.is_active.check:
; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU: worker_state_machine.parallel_region.check:
; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID
; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; AMDGPU: worker_state_machine.parallel_region.execute:
; AMDGPU-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU: worker_state_machine.parallel_region.end:
; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU: worker_state_machine.done.barrier:
; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU: thread.user_code.check:
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU: common.ret:
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]]
; AMDGPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]]
; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
; NVPTX-SAME: () #[[ATTR0]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: is_worker_check:
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX: worker_state_machine.begin:
; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX: worker_state_machine.parallel_region.check:
; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID
; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; NVPTX: worker_state_machine.parallel_region.execute:
; NVPTX-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX: worker_state_machine.parallel_region.end:
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX: common.ret:
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]]
; NVPTX-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]]
; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
; AMDGPU-DISABLED1-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null)
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED1: is_worker_check:
; AMDGPU-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.begin:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.finished:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: worker_state_machine.is_active.check:
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED1-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID
; AMDGPU-DISABLED1-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED1-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED1: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED1: worker_state_machine.done.barrier:
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED1: thread.user_code.check:
; AMDGPU-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED1: common.ret:
; AMDGPU-DISABLED1-NEXT: ret void
; AMDGPU-DISABLED1: user_code.entry:
; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
; AMDGPU-DISABLED2-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null)
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED2: is_worker_check:
; AMDGPU-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.begin:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.finished:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: worker_state_machine.is_active.check:
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED2-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID
; AMDGPU-DISABLED2-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED2-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED2: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED2: worker_state_machine.done.barrier:
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED2: thread.user_code.check:
; AMDGPU-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED2: common.ret:
; AMDGPU-DISABLED2-NEXT: ret void
; AMDGPU-DISABLED2: user_code.entry:
; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
; NVPTX-DISABLED1-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null)
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED1: is_worker_check:
; NVPTX-DISABLED1-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED1-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED1-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED1-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED1-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED1: worker_state_machine.begin:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED1-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED1-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED1: worker_state_machine.finished:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: worker_state_machine.is_active.check:
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED1-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID
; NVPTX-DISABLED1-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED1-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED1: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED1: worker_state_machine.done.barrier:
; NVPTX-DISABLED1-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED1-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED1: thread.user_code.check:
; NVPTX-DISABLED1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED1: common.ret:
; NVPTX-DISABLED1-NEXT: ret void
; NVPTX-DISABLED1: user_code.entry:
; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]]
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
; NVPTX-DISABLED2-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null)
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED2: is_worker_check:
; NVPTX-DISABLED2-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED2-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED2-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED2-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED2-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED2: worker_state_machine.begin:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED2-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED2-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED2: worker_state_machine.finished:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: worker_state_machine.is_active.check:
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED2-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID
; NVPTX-DISABLED2-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED2-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED2: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED2: worker_state_machine.done.barrier:
; NVPTX-DISABLED2-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED2-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED2: thread.user_code.check:
; NVPTX-DISABLED2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED2: common.ret:
; NVPTX-DISABLED2-NEXT: ret void
; NVPTX-DISABLED2: user_code.entry:
; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]]
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment)
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU-DISABLED: is_worker_check:
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; AMDGPU-DISABLED: worker_state_machine.begin:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU-DISABLED: worker_state_machine.finished:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU-DISABLED: worker_state_machine.done.barrier:
; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU-DISABLED: thread.user_code.check:
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; AMDGPU-DISABLED: common.ret:
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment)
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX-DISABLED: is_worker_check:
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
; NVPTX-DISABLED: worker_state_machine.begin:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]])
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX-DISABLED: worker_state_machine.finished:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker_state_machine.is_active.check:
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX-DISABLED: worker_state_machine.done.barrier:
; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]])
; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX-DISABLED: thread.user_code.check:
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; NVPTX-DISABLED: common.ret:
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]]
; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
entry:
%captured_vars_addrs = alloca [0 x ptr], align 8
%0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null)
%exec_user_code = icmp eq i32 %0, -1
br i1 %exec_user_code, label %user_code.entry, label %common.ret
common.ret: ; preds = %entry, %user_code.entry
ret void
user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(ptr @1)
%2 = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %1, i32 1, i64 40, i64 0, ptr @"_omp_task_entry$")
%3 = call i32 @__kmpc_omp_task(ptr @1, i32 %1, ptr %2)
call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper, ptr %captured_vars_addrs, i64 0)
call void @__kmpc_target_deinit()
br label %common.ret
}
; Function Attrs: alwaysinline convergent nounwind
define internal void @.omp_outlined.(i32 %.global_tid., ptr noalias %.part_id., ptr noalias %.privates., ptr noalias %.copy_fn., ptr %.task_t., ptr noalias %__context) #9 {
; AMDGPU-LABEL: define {{[^@]+}}@.omp_outlined.
; AMDGPU-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@.omp_outlined.
; NVPTX-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@.omp_outlined.
; AMDGPU-DISABLED1-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@.omp_outlined.
; AMDGPU-DISABLED2-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@.omp_outlined.
; NVPTX-DISABLED1-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@.omp_outlined.
; NVPTX-DISABLED2-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined.
; AMDGPU-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined.
; NVPTX-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]]
; NVPTX-DISABLED-NEXT: ret void
entry:
call void @spmd_amenable() #10
ret void
}
; Function Attrs: convergent norecurse nounwind
define internal i32 @"_omp_task_entry$"(i32 %0, ptr noalias %1) #3 {
entry:
%2 = getelementptr inbounds %struct.kmp_task_t, ptr %1, i32 0, i32 2
%3 = load ptr, ptr %1, align 8, !tbaa !30
call void @.omp_outlined.(i32 %0, ptr %2, ptr null, ptr null, ptr %1, ptr %3) #6
ret i32 0
}
; Function Attrs: nounwind
declare ptr @__kmpc_omp_task_alloc(ptr, i32, i32, i64, i64, ptr) #6
; Function Attrs: nounwind
declare i32 @__kmpc_omp_task(ptr, i32, ptr) #6
; Function Attrs: nosync nounwind
declare void @__kmpc_free_shared(ptr nocapture, i64) #8
; Function Attrs: nofree nosync nounwind
declare ptr @__kmpc_alloc_shared(i64) #7
; Function Attrs: convergent
declare void @use(ptr nocapture) #5
; Function Attrs: convergent
declare void @unknown() #2
declare void @unknowni32p(ptr) #2
; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
; Make it a weak definition so we will apply custom state machine rewriting but can't use the body in the reasoning.
define weak i32 @__kmpc_target_init(ptr, ptr) {
; AMDGPU-LABEL: define {{[^@]+}}@__kmpc_target_init
; AMDGPU-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; AMDGPU-NEXT: ret i32 0
;
; NVPTX-LABEL: define {{[^@]+}}@__kmpc_target_init
; NVPTX-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; NVPTX-NEXT: ret i32 0
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__kmpc_target_init
; AMDGPU-DISABLED1-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; AMDGPU-DISABLED1-NEXT: ret i32 0
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__kmpc_target_init
; AMDGPU-DISABLED2-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; AMDGPU-DISABLED2-NEXT: ret i32 0
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__kmpc_target_init
; NVPTX-DISABLED1-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; NVPTX-DISABLED1-NEXT: ret i32 0
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__kmpc_target_init
; NVPTX-DISABLED2-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
; NVPTX-DISABLED2-NEXT: ret i32 0
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__kmpc_target_init
; AMDGPU-DISABLED-SAME: (ptr [[TMP0:%.*]]) {
; AMDGPU-DISABLED-NEXT: ret i32 0
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__kmpc_target_init
; NVPTX-DISABLED-SAME: (ptr [[TMP0:%.*]]) {
; NVPTX-DISABLED-NEXT: ret i32 0
ret i32 0
}
declare void @__kmpc_get_shared_variables(ptr)
; Function Attrs: alwaysinline
declare void @__kmpc_parallel_51(ptr, i32, i32, i32, i32, ptr, ptr, ptr, i64) #4
; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
; Function Attrs: convergent
declare void @spmd_amenable() #5
; Function Attrs: nounwind
declare i32 @__kmpc_global_thread_num(ptr) #6
declare void @__kmpc_target_deinit()
; Function Attrs: alwaysinline convergent norecurse nounwind
define internal void @__omp_outlined__9(ptr noalias %.global_tid., ptr noalias %.bound_tid.) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9
; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9
; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__9
; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__9
; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__9
; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__9
; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]]
; NVPTX-DISABLED-NEXT: ret void
entry:
call void @unknown() #11
ret void
}
; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-NEXT: entry:
; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-NEXT: ret void
;
; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
; AMDGPU-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED1-NEXT: entry:
; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED1-NEXT: ret void
;
; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
; AMDGPU-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED2-NEXT: entry:
; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED2-NEXT: ret void
;
; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
; NVPTX-DISABLED1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED1-NEXT: entry:
; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED1-NEXT: ret void
;
; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
; NVPTX-DISABLED2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED2-NEXT: entry:
; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED2-NEXT: ret void
;
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; AMDGPU-DISABLED-NEXT: ret void
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
; NVPTX-DISABLED-NEXT: ret void
entry:
%.addr1 = alloca i32, align 4
%.zero.addr = alloca i32, align 4
%global_args = alloca ptr, align 8
store i32 %1, ptr %.addr1, align 4, !tbaa !18
store i32 0, ptr %.zero.addr, align 4
call void @__kmpc_get_shared_variables(ptr %global_args)
call void @__omp_outlined__9(ptr %.addr1, ptr %.zero.addr) #6
ret void
}
declare fastcc i32 @__kmpc_get_hardware_thread_id_in_block();
attributes #0 = { alwaysinline convergent norecurse nounwind "kernel" }
attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn }
attributes #2 = { convergent }
attributes #3 = { convergent norecurse nounwind }
attributes #4 = { alwaysinline }
attributes #5 = { convergent "llvm.assume"="ompx_spmd_amenable" }
attributes #6 = { nounwind }
attributes #7 = { nofree nosync nounwind }
attributes #8 = { nosync nounwind }
attributes #9 = { alwaysinline convergent nounwind }
attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" }
attributes #11 = { convergent }
!omp_offload.info = !{!0, !1, !2, !3, !4, !5}
!nvvm.annotations = !{!6, !7, !8, !9, !10, !11}
!llvm.module.flags = !{!12, !13, !14, !15, !16}
!llvm.ident = !{!17}
!0 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
!1 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
!2 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
!3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
!4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
!5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
!6 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
!7 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
!8 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
!9 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
!10 = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
!11 = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
!12 = !{i32 1, !"wchar_size", i32 4}
!13 = !{i32 7, !"openmp", i32 50}
!14 = !{i32 7, !"openmp-device", i32 50}
!15 = !{i32 8, !"PIC Level", i32 2}
!16 = !{i32 7, !"frame-pointer", i32 2}
!17 = !{!"clang version 14.0.0"}
!18 = !{!19, !19, i64 0}
!19 = !{!"int", !20, i64 0}
!20 = !{!"omnipotent char", !21, i64 0}
!21 = !{!"Simple C/C++ TBAA"}
!22 = distinct !{!22, !23, !24}
!23 = !{!"llvm.loop.mustprogress"}
!24 = !{!"llvm.loop.unroll.disable"}
!25 = distinct !{!25, !23, !24}
!26 = !{!27, !27, i64 0}
!27 = !{!"any pointer", !20, i64 0}
!28 = distinct !{!28, !23, !24}
!29 = distinct !{!29, !23, !24}
!30 = !{!31, !27, i64 0}
!31 = !{!"kmp_task_t_with_privates", !32, i64 0}
!32 = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
; AMDGPU-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind }
; AMDGPU-DISABLED: attributes #[[ATTR1]] = { norecurse }
; AMDGPU-DISABLED: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; AMDGPU-DISABLED: attributes #[[ATTR3]] = { alwaysinline norecurse nounwind }
; AMDGPU-DISABLED: attributes #[[ATTR4]] = { alwaysinline convergent nounwind }
; AMDGPU-DISABLED: attributes #[[ATTR5]] = { nounwind }
; AMDGPU-DISABLED: attributes #[[ATTR6:[0-9]+]] = { nosync nounwind }
; AMDGPU-DISABLED: attributes #[[ATTR7:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; AMDGPU-DISABLED: attributes #[[ATTR8]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; AMDGPU-DISABLED: attributes #[[ATTR9]] = { convergent }
; AMDGPU-DISABLED: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; AMDGPU-DISABLED: attributes #[[ATTR11:[0-9]+]] = { alwaysinline }
; AMDGPU-DISABLED: attributes #[[ATTR12:[0-9]+]] = { convergent nounwind }
; NVPTX-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind }
; NVPTX-DISABLED: attributes #[[ATTR1]] = { norecurse }
; NVPTX-DISABLED: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; NVPTX-DISABLED: attributes #[[ATTR3]] = { alwaysinline norecurse nounwind }
; NVPTX-DISABLED: attributes #[[ATTR4]] = { alwaysinline convergent nounwind }
; NVPTX-DISABLED: attributes #[[ATTR5]] = { nounwind }
; NVPTX-DISABLED: attributes #[[ATTR6:[0-9]+]] = { nosync nounwind }
; NVPTX-DISABLED: attributes #[[ATTR7:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; NVPTX-DISABLED: attributes #[[ATTR8]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; NVPTX-DISABLED: attributes #[[ATTR9]] = { convergent }
; NVPTX-DISABLED: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; NVPTX-DISABLED: attributes #[[ATTR11:[0-9]+]] = { alwaysinline }
; NVPTX-DISABLED: attributes #[[ATTR12:[0-9]+]] = { convergent nounwind }
; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; AMDGPU-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; AMDGPU-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; AMDGPU-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; AMDGPU-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; AMDGPU-DISABLED: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; AMDGPU-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; AMDGPU-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
; AMDGPU-DISABLED: [[TBAA18]] = !{!19, !19, i64 0}
; AMDGPU-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
; AMDGPU-DISABLED: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
; AMDGPU-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
; AMDGPU-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24}
; AMDGPU-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
; AMDGPU-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
; AMDGPU-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24}
; AMDGPU-DISABLED: [[TBAA26]] = !{!27, !27, i64 0}
; AMDGPU-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
; AMDGPU-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24}
; AMDGPU-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24}
; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; NVPTX-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; NVPTX-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; NVPTX-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; NVPTX-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; NVPTX-DISABLED: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
; NVPTX-DISABLED: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
; NVPTX-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
; NVPTX-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
; NVPTX-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
; NVPTX-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
; NVPTX-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; NVPTX-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; NVPTX-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; NVPTX-DISABLED: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; NVPTX-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; NVPTX-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
; NVPTX-DISABLED: [[TBAA18]] = !{!19, !19, i64 0}
; NVPTX-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
; NVPTX-DISABLED: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
; NVPTX-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
; NVPTX-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24}
; NVPTX-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
; NVPTX-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
; NVPTX-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24}
; NVPTX-DISABLED: [[TBAA26]] = !{!27, !27, i64 0}
; NVPTX-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
; NVPTX-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24}
; NVPTX-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24}
;.
; AMDGPU: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; AMDGPU: attributes #[[ATTR1]] = { norecurse }
; AMDGPU: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; AMDGPU: attributes #[[ATTR3]] = { alwaysinline convergent nounwind }
; AMDGPU: attributes #[[ATTR4]] = { nounwind }
; AMDGPU: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind }
; AMDGPU: attributes #[[ATTR6:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; AMDGPU: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; AMDGPU: attributes #[[ATTR8]] = { convergent }
; AMDGPU: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; AMDGPU: attributes #[[ATTR10:[0-9]+]] = { alwaysinline }
; AMDGPU: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; NVPTX: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; NVPTX: attributes #[[ATTR1]] = { norecurse }
; NVPTX: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; NVPTX: attributes #[[ATTR3]] = { alwaysinline convergent nounwind }
; NVPTX: attributes #[[ATTR4]] = { nounwind }
; NVPTX: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind }
; NVPTX: attributes #[[ATTR6:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; NVPTX: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; NVPTX: attributes #[[ATTR8]] = { convergent }
; NVPTX: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; NVPTX: attributes #[[ATTR10:[0-9]+]] = { alwaysinline }
; NVPTX: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; AMDGPU-DISABLED1: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; AMDGPU-DISABLED1: attributes #[[ATTR1]] = { norecurse }
; AMDGPU-DISABLED1: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; AMDGPU-DISABLED1: attributes #[[ATTR3]] = { alwaysinline convergent nounwind }
; AMDGPU-DISABLED1: attributes #[[ATTR4]] = { nounwind }
; AMDGPU-DISABLED1: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind }
; AMDGPU-DISABLED1: attributes #[[ATTR6:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; AMDGPU-DISABLED1: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; AMDGPU-DISABLED1: attributes #[[ATTR8]] = { convergent }
; AMDGPU-DISABLED1: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; AMDGPU-DISABLED1: attributes #[[ATTR10:[0-9]+]] = { alwaysinline }
; AMDGPU-DISABLED1: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; AMDGPU-DISABLED2: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; AMDGPU-DISABLED2: attributes #[[ATTR1]] = { norecurse }
; AMDGPU-DISABLED2: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; AMDGPU-DISABLED2: attributes #[[ATTR3]] = { alwaysinline convergent nounwind }
; AMDGPU-DISABLED2: attributes #[[ATTR4]] = { nounwind }
; AMDGPU-DISABLED2: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind }
; AMDGPU-DISABLED2: attributes #[[ATTR6:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; AMDGPU-DISABLED2: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; AMDGPU-DISABLED2: attributes #[[ATTR8]] = { convergent }
; AMDGPU-DISABLED2: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; AMDGPU-DISABLED2: attributes #[[ATTR10:[0-9]+]] = { alwaysinline }
; AMDGPU-DISABLED2: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; NVPTX-DISABLED1: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; NVPTX-DISABLED1: attributes #[[ATTR1]] = { norecurse }
; NVPTX-DISABLED1: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; NVPTX-DISABLED1: attributes #[[ATTR3]] = { alwaysinline convergent nounwind }
; NVPTX-DISABLED1: attributes #[[ATTR4]] = { nounwind }
; NVPTX-DISABLED1: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind }
; NVPTX-DISABLED1: attributes #[[ATTR6:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; NVPTX-DISABLED1: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; NVPTX-DISABLED1: attributes #[[ATTR8]] = { convergent }
; NVPTX-DISABLED1: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; NVPTX-DISABLED1: attributes #[[ATTR10:[0-9]+]] = { alwaysinline }
; NVPTX-DISABLED1: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; NVPTX-DISABLED2: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" }
; NVPTX-DISABLED2: attributes #[[ATTR1]] = { norecurse }
; NVPTX-DISABLED2: attributes #[[ATTR2]] = { convergent norecurse nounwind }
; NVPTX-DISABLED2: attributes #[[ATTR3]] = { alwaysinline convergent nounwind }
; NVPTX-DISABLED2: attributes #[[ATTR4]] = { nounwind }
; NVPTX-DISABLED2: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind }
; NVPTX-DISABLED2: attributes #[[ATTR6:[0-9]+]] = { nofree nosync nounwind allocsize(0) }
; NVPTX-DISABLED2: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
; NVPTX-DISABLED2: attributes #[[ATTR8]] = { convergent }
; NVPTX-DISABLED2: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; NVPTX-DISABLED2: attributes #[[ATTR10:[0-9]+]] = { alwaysinline }
; NVPTX-DISABLED2: attributes #[[ATTR11:[0-9]+]] = { convergent nounwind }
;.
; AMDGPU: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; AMDGPU: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; AMDGPU: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; AMDGPU: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
; AMDGPU: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
; AMDGPU: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
; AMDGPU: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
; AMDGPU: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
; AMDGPU: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
; AMDGPU: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; AMDGPU: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; AMDGPU: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; AMDGPU: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; AMDGPU: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; AMDGPU: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; AMDGPU: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
; AMDGPU: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
; AMDGPU: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
; AMDGPU: [[META21]] = !{!"Simple C/C++ TBAA"}
; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
; AMDGPU: [[META23]] = !{!"llvm.loop.mustprogress"}
; AMDGPU: [[META24]] = !{!"llvm.loop.unroll.disable"}
; AMDGPU: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
; AMDGPU: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
; AMDGPU: [[META27]] = !{!"any pointer", [[META20]], i64 0}
; AMDGPU: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
; AMDGPU: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
;.
; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; NVPTX: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; NVPTX: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
; NVPTX: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
; NVPTX: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
; NVPTX: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
; NVPTX: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
; NVPTX: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
; NVPTX: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; NVPTX: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; NVPTX: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; NVPTX: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; NVPTX: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; NVPTX: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; NVPTX: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
; NVPTX: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
; NVPTX: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
; NVPTX: [[META21]] = !{!"Simple C/C++ TBAA"}
; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
; NVPTX: [[META23]] = !{!"llvm.loop.mustprogress"}
; NVPTX: [[META24]] = !{!"llvm.loop.unroll.disable"}
; NVPTX: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
; NVPTX: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
; NVPTX: [[META27]] = !{!"any pointer", [[META20]], i64 0}
; NVPTX: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
; NVPTX: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
;.
; AMDGPU-DISABLED1: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; AMDGPU-DISABLED1: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; AMDGPU-DISABLED1: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; AMDGPU-DISABLED1: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; AMDGPU-DISABLED1: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; AMDGPU-DISABLED1: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; AMDGPU-DISABLED1: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
; AMDGPU-DISABLED1: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
; AMDGPU-DISABLED1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
; AMDGPU-DISABLED1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
; AMDGPU-DISABLED1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
; AMDGPU-DISABLED1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
; AMDGPU-DISABLED1: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; AMDGPU-DISABLED1: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; AMDGPU-DISABLED1: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; AMDGPU-DISABLED1: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; AMDGPU-DISABLED1: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; AMDGPU-DISABLED1: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; AMDGPU-DISABLED1: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
; AMDGPU-DISABLED1: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
; AMDGPU-DISABLED1: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
; AMDGPU-DISABLED1: [[META21]] = !{!"Simple C/C++ TBAA"}
; AMDGPU-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
; AMDGPU-DISABLED1: [[META23]] = !{!"llvm.loop.mustprogress"}
; AMDGPU-DISABLED1: [[META24]] = !{!"llvm.loop.unroll.disable"}
; AMDGPU-DISABLED1: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
; AMDGPU-DISABLED1: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
; AMDGPU-DISABLED1: [[META27]] = !{!"any pointer", [[META20]], i64 0}
; AMDGPU-DISABLED1: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
; AMDGPU-DISABLED1: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
;.
; AMDGPU-DISABLED2: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; AMDGPU-DISABLED2: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; AMDGPU-DISABLED2: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; AMDGPU-DISABLED2: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; AMDGPU-DISABLED2: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; AMDGPU-DISABLED2: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; AMDGPU-DISABLED2: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
; AMDGPU-DISABLED2: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
; AMDGPU-DISABLED2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
; AMDGPU-DISABLED2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
; AMDGPU-DISABLED2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
; AMDGPU-DISABLED2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
; AMDGPU-DISABLED2: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; AMDGPU-DISABLED2: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; AMDGPU-DISABLED2: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; AMDGPU-DISABLED2: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; AMDGPU-DISABLED2: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; AMDGPU-DISABLED2: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; AMDGPU-DISABLED2: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
; AMDGPU-DISABLED2: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
; AMDGPU-DISABLED2: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
; AMDGPU-DISABLED2: [[META21]] = !{!"Simple C/C++ TBAA"}
; AMDGPU-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
; AMDGPU-DISABLED2: [[META23]] = !{!"llvm.loop.mustprogress"}
; AMDGPU-DISABLED2: [[META24]] = !{!"llvm.loop.unroll.disable"}
; AMDGPU-DISABLED2: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
; AMDGPU-DISABLED2: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
; AMDGPU-DISABLED2: [[META27]] = !{!"any pointer", [[META20]], i64 0}
; AMDGPU-DISABLED2: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
; AMDGPU-DISABLED2: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
;.
; NVPTX-DISABLED1: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; NVPTX-DISABLED1: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; NVPTX-DISABLED1: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; NVPTX-DISABLED1: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; NVPTX-DISABLED1: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; NVPTX-DISABLED1: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; NVPTX-DISABLED1: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
; NVPTX-DISABLED1: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
; NVPTX-DISABLED1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
; NVPTX-DISABLED1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
; NVPTX-DISABLED1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
; NVPTX-DISABLED1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
; NVPTX-DISABLED1: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; NVPTX-DISABLED1: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; NVPTX-DISABLED1: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; NVPTX-DISABLED1: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; NVPTX-DISABLED1: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; NVPTX-DISABLED1: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; NVPTX-DISABLED1: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
; NVPTX-DISABLED1: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
; NVPTX-DISABLED1: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
; NVPTX-DISABLED1: [[META21]] = !{!"Simple C/C++ TBAA"}
; NVPTX-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
; NVPTX-DISABLED1: [[META23]] = !{!"llvm.loop.mustprogress"}
; NVPTX-DISABLED1: [[META24]] = !{!"llvm.loop.unroll.disable"}
; NVPTX-DISABLED1: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
; NVPTX-DISABLED1: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
; NVPTX-DISABLED1: [[META27]] = !{!"any pointer", [[META20]], i64 0}
; NVPTX-DISABLED1: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
; NVPTX-DISABLED1: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
;.
; NVPTX-DISABLED2: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; NVPTX-DISABLED2: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
; NVPTX-DISABLED2: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
; NVPTX-DISABLED2: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; NVPTX-DISABLED2: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; NVPTX-DISABLED2: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
; NVPTX-DISABLED2: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
; NVPTX-DISABLED2: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
; NVPTX-DISABLED2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
; NVPTX-DISABLED2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
; NVPTX-DISABLED2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
; NVPTX-DISABLED2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
; NVPTX-DISABLED2: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; NVPTX-DISABLED2: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; NVPTX-DISABLED2: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; NVPTX-DISABLED2: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
; NVPTX-DISABLED2: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; NVPTX-DISABLED2: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
; NVPTX-DISABLED2: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
; NVPTX-DISABLED2: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
; NVPTX-DISABLED2: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
; NVPTX-DISABLED2: [[META21]] = !{!"Simple C/C++ TBAA"}
; NVPTX-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
; NVPTX-DISABLED2: [[META23]] = !{!"llvm.loop.mustprogress"}
; NVPTX-DISABLED2: [[META24]] = !{!"llvm.loop.unroll.disable"}
; NVPTX-DISABLED2: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
; NVPTX-DISABLED2: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
; NVPTX-DISABLED2: [[META27]] = !{!"any pointer", [[META20]], i64 0}
; NVPTX-DISABLED2: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
; NVPTX-DISABLED2: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
;.