[SPIRV][AMDGPU][clang][CodeGen][opt] Add late-resolved feature identifying predicates (#134016)

This change adds two builtins for AMDGPU: - `__builtin_amdgcn_processor_is`, which is similar in observable behaviour with `__builtin_cpu_is`, except that it is never "evaluated" at run time; - `__builtin_amdgcn_is_invocable`, which is behaviourally similar with `__has_builtin`, except that it is not a macro (i.e. not evaluated at preprocessing time). Neither of these are `constexpr`, even though when compiling for concrete (i.e. `gfxXXX` / `gfxXXX-generic`) targets they get evaluated in Clang, so they shouldn't tear the AST too badly / at all for multi-pass compilation cases like HIP. They can only be used in specific contexts (as args to control structures). The motivation for adding these is two-fold: - as a nice to have, it provides an AST-visible way to incorporate architecture specific code, rather than having to rely on macros and the preprocessor, which burn in the choice quite early; - as a must have, it allows featureful AMDGCN flavoured SPIR-V to be produced, where target specific capability is guarded and chosen or discarded when finalising compilation for a concrete target; this is built atop the Speciali\ation Constant concept which is described in the SPIR-V specification under section [2.12 Specialization](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_specialization_2) I've tried to keep the overall footprint of the change small. The changes to Sema are a bit unpleasant, but there was a strong desire to have Clang validate these, and to constrain their uses, and this was the most compact solution I could come up with (suggestions welcome). --------- Co-authored-by: Juan Manuel Martinez Caamaño <jmartinezcaamao@gmail.com> Co-authored-by: Voicu <avoicu@amd.com>
2026-03-30 23:02:26 +01:00 · 2026-03-30 23:02:26 +01:00 · 18e6958903
commit 18e6958903
parent 5b00cdf8e1
47 changed files with 2099 additions and 161 deletions
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@ -5558,6 +5558,120 @@ If no address spaces names are provided, all address spaces are fenced.
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")

+__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide
+a functional mechanism for programatically querying:
+
+* the identity of the current target processor;
+* the capability of the current target processor to invoke a particular builtin.
+
+**Syntax**:
+
+.. code-block:: c
+
+  __amdgpu_feature_predicate_t __builtin_amdgcn_processor_is(const char*);
+  __amdgpu_feature_predicate_t __builtin_amdgcn_is_invocable(builtin_name);
+
+**Example of use**:
+
+.. code-block:: c++
+
+  if (__builtin_amdgcn_processor_is("gfx1201") ||
+      __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+    __builtin_amdgcn_s_sleep_var(x);
+
+  if (!__builtin_amdgcn_processor_is("gfx906"))
+    __builtin_amdgcn_s_wait_event_export_ready();
+  else if (__builtin_amdgcn_processor_is("gfx1010") ||
+           __builtin_amdgcn_processor_is("gfx1101"))
+    __builtin_amdgcn_s_ttracedata_imm(1);
+
+  while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
+
+  do {
+    break;
+  } while (__builtin_amdgcn_processor_is("gfx1010"));
+
+  for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
+
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+    __builtin_amdgcn_s_wait_event_export_ready();
+  else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
+    __builtin_amdgcn_s_ttracedata_imm(1);
+
+  do {
+    break;
+  } while (
+      __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+
+  for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p)
+    break;
+
+**Description**:
+
+The builtins return a value of type ``__amdgpu_feature_predicate_t``, which is a
+target specific type that behaves as if its C++ definition was the following:
+
+.. code-block:: c++
+
+  struct __amdgpu_feature_predicate_t {
+    __amdgpu_feature_predicate_t() = delete;
+    __amdgpu_feature_predicate_t(const __amdgpu_feature_predicate_t&) = delete;
+    __amdgpu_feature_predicate_t(__amdgpu_feature_predicate_t&&) = delete;
+
+    explicit
+    operator bool() const noexcept;
+  };
+
+The builtins can be used in C as well, wherein the
+``__amdgpu_feature_predicate_t`` type behaves as an opaque, forward declared
+type with conditional automated conversion to ``_Bool`` when used as the
+predicate argument to a control structure:
+
+.. code-block:: c
+
+  struct __amdgpu_feature_predicate_t ret();     // Error
+  void arg(struct __amdgpu_feature_predicate_t); // Error
+  void local() {
+    struct __amdgpu_feature_predicate_t x;       // Error
+    struct __amdgpu_feature_predicate_t y =
+        __builtin_amdgcn_processor_is("gfx900"); // Error
+  }
+  void valid_use() {
+    _Bool x = (_Bool)__builtin_amdgcn_processor_is("gfx900"); // OK
+    if (__builtin_amdgcn_processor_is("gfx900"))       // Implicit cast to _Bool
+      return;
+    for (; __builtin_amdgcn_processor_is("gfx900");)   // Implicit cast to _Bool
+      break;
+    while (__builtin_amdgcn_processor_is("gfx900"))    // Implicit cast to _Bool
+      break;
+    do {
+      break;
+    } while (__builtin_amdgcn_processor_is("gfx900")); // Implicit cast to _Bool
+
+    __builtin_amdgcn_processor_is("gfx900") ? x : !x;
+  }
+
+The boolean interpretation of the predicate values returned by the builtins:
+
+* indicates whether the current target matches the argument; the argument MUST
+  be a string literal and a valid AMDGPU target
+* indicates whether the builtin function passed as the argument can be invoked
+  by the current target; the argument MUST be either a generic or AMDGPU
+  specific builtin name
+
+When invoked while compiling for a concrete target, the builtins are evaluated
+early by Clang, and never produce any CodeGen effects / have no observable
+side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-v,
+which is an abstract target, a series of specialization constants are implicitly
+created, in correspondence with the predicates. These predicates get resolved
+when finalizing the compilation process for a concrete target, and shall reflect
+the latter's identity and features. Thus, it is possible to author high-level
+code, in e.g. HIP, that is target adaptive in a dynamic fashion, contrary to
+macro based mechanisms.
+
 __builtin_amdgcn_ballot_w{32,64}
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@ -458,7 +458,10 @@ Target Specific Changes

 AMDGPU Support
 ^^^^^^^^^^^^^^
-
+- Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``,
+  a late / deferred query for the current target processor.
+- Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``,
+  a late / deferred query for the availability of target specific builtins.
 - Initial support for gfx1310

 NVPTX Support
--- a/clang/include/clang/Basic/AMDGPUTypes.def
+++ b/clang/include/clang/Basic/AMDGPUTypes.def
@ -20,11 +20,19 @@
  AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)
 #endif

+#ifndef AMDGPU_FEATURE_PREDICATE_TYPE
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \
+  AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)
+#endif
+
 AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8)
 AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_texture_t", AMDGPUTexture, AMDGPUTextureTy, 256, 256, 0)

 AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0)

+AMDGPU_FEATURE_PREDICATE_TYPE("__amdgpu_feature_predicate_t", AMDGPUFeaturePredicate, AMDGPUFeaturePredicateTy, 1, 1)
+
 #undef AMDGPU_TYPE
 #undef AMDGPU_OPAQUE_PTR_TYPE
 #undef AMDGPU_NAMED_BARRIER_TYPE
+#undef AMDGPU_FEATURE_PREDICATE_TYPE
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@ -34,6 +34,7 @@
 //  Q -> target builtin type, followed by a character to distinguish the builtin type
 //    Qa -> AArch64 svcount_t builtin type.
 //    Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type.
+//    Qc -> AMDGPU __amdgpu_feature_predicate_t builtin type.
 //    Qt -> AMDGPU __amdgpu_texture_t builtin type.
 //  E -> ext_vector, followed by the number of elements and the base type.
 //  X -> _Complex, followed by the base type.
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@ -523,6 +523,11 @@ def __builtin_amdgcn_endpgm : AMDGPUBuiltin<"void()", [NoReturn]>;
 def __builtin_amdgcn_get_fpenv : AMDGPUBuiltin<"uint64_t()">;
 def __builtin_amdgcn_set_fpenv : AMDGPUBuiltin<"void(uint64_t)">;

+// These are special FE only builtins intended for forwarding the requirements
+// to the ME.
+def __builtin_amdgcn_processor_is : AMDGPUBuiltin<"__amdgpu_feature_predicate_t(char const *)", [NoThrow, Const, CustomTypeChecking, UnevaluatedArguments]>;
+def __builtin_amdgcn_is_invocable : AMDGPUBuiltin<"__amdgpu_feature_predicate_t()", [NoThrow, Const, CustomTypeChecking, UnevaluatedArguments]>;
+
 //===----------------------------------------------------------------------===//

 // Wave Reduction builtins.
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@ -1927,3 +1927,6 @@ def TrivialAutoVarInit : DiagGroup<"trivial-auto-var-init">;

 // A warning for options that enable a feature that is not yet complete
 def ExperimentalOption : DiagGroup<"experimental-option">;
+
+// Warnings about unguarded usages of AMDGPU target specific constructs
+def UnguardedBuiltinUsageAMDGPU : DiagGroup<"amdgpu-unguarded-builtin-usage">;
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@ -14117,6 +14117,47 @@ def note_acc_reduction_combiner_forming
 // AMDGCN builtins diagnostics
 def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
 def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
+def err_amdgcn_processor_is_arg_not_literal
+    : Error<"the argument to __builtin_amdgcn_processor_is must be a string "
+            "literal">;
+def err_amdgcn_processor_is_arg_invalid_value
+    : Error<"the argument to __builtin_amdgcn_processor_is must be a valid "
+            "AMDGCN processor identifier; '%0' is not valid">;
+def note_amdgcn_processor_is_valid_options
+    : Note<"valid AMDGCN processor identifiers are: %0">;
+def err_amdgcn_is_invocable_arg_invalid_value
+    : Error<"the argument to __builtin_amdgcn_is_invocable must be either a "
+            "target agnostic builtin or an AMDGCN target specific builtin; '%0'"
+            " is not valid">;
+def err_amdgcn_predicate_type_is_not_constructible
+    : Error<"%0 has type __amdgpu_feature_predicate_t, which is not"
+            " constructible">;
+def err_amdgcn_predicate_type_needs_explicit_bool_cast
+    : Error<"%0 must be explicitly cast to %1; however, please note that this "
+            "is almost always an error and that it prevents the effective "
+            "guarding of target dependent code, and thus should be avoided">;
+def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled"
+                                              " by AMDGPU feature predicate">;
+def err_amdgcn_conflicting_is_processor_options
+    : Error<"conflicting check for AMDGCN processor %0 found in a scope already"
+            " controlled by a check for AMDGCN processor">;
+def note_amdgcn_previous_is_processor_guard
+    : Note<"predicate guard, with establishes the context, inserted here">;
+def warn_amdgcn_unguarded_asm_stmt
+    : Warning<"the '%0' ASM sequence might be invalid for some AMDGPU targets">,
+    InGroup<UnguardedBuiltinUsageAMDGPU>, DefaultIgnore;
+def note_amdgcn_unguarded_asm_silence
+  : Note<"enclose the '%0' ASM sequence in a scope controlled by a "
+         "__builtin_amdgcn_is_processor check to silence this warning">;
+def err_amdgcn_incompatible_builtin
+  : Error<"%0 cannot be invoked in the current context, as it requires the "
+          "'%1' feature(s)%select{|, which '%3' does not provide}2">;
+def warn_amdgcn_unguarded_builtin :
+  Warning<"%0 might be unavailable on some AMDGPU targets">,
+  InGroup<UnguardedBuiltinUsageAMDGPU>, DefaultIgnore;
+def note_amdgcn_unguarded_builtin_silence
+  : Note<"enclose %0 in a __builtin_amdgcn_is_invocable check to silence "
+         "this warning">;

 def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;

--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@ -15,12 +15,17 @@

 #include "clang/AST/ASTFwd.h"
 #include "clang/Sema/SemaBase.h"
+#include "llvm/ADT/SmallPtrSet.h"

 namespace clang {
 class AttributeCommonInfo;
+class Expr;
 class ParsedAttr;

 class SemaAMDGPU : public SemaBase {
+  llvm::SmallPtrSet<Expr *, 32> ExpandedPredicates;
+  llvm::SmallPtrSet<FunctionDecl *, 32> PotentiallyUnguardedBuiltinUsers;
+
 public:
  SemaAMDGPU(Sema &S);

@ -73,6 +78,16 @@ public:
  void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL);
  void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL);
  void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
+
+  /// Expand a valid use of the feature identification builtins into its
+  /// corresponding sequence of instructions.
+  Expr *ExpandAMDGPUPredicateBuiltIn(Expr *CE);
+  bool IsPredicate(Expr *E) const;
+  /// Diagnose unguarded usages of AMDGPU builtins and recommend guarding with
+  /// __builtin_amdgcn_is_invocable
+  void AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD);
+  bool HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const;
+  void DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD);
 };
 } // namespace clang

--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@ -1166,7 +1166,7 @@ enum PredefinedTypeIDs {
 ///
 /// Type IDs for non-predefined types will start at
 /// NUM_PREDEF_TYPE_IDs.
-const unsigned NUM_PREDEF_TYPE_IDS = 514;
+const unsigned NUM_PREDEF_TYPE_IDS = 515;

 // Ensure we do not overrun the predefined types we reserved
 // in the enum PredefinedTypeIDs above.
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@ -1426,7 +1426,12 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
  }

  if (Target.getTriple().isAMDGPU() ||
-      (AuxTarget && AuxTarget->getTriple().isAMDGPU())) {
+      (Target.getTriple().isSPIRV() &&
+       Target.getTriple().getVendor() == llvm::Triple::AMD) ||
+      (AuxTarget &&
+       (AuxTarget->getTriple().isAMDGPU() ||
+        ((AuxTarget->getTriple().isSPIRV() &&
+          AuxTarget->getTriple().getVendor() == llvm::Triple::AMD))))) {
 #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)                       \
  InitBuiltinType(SingletonId, BuiltinType::Id);
 #include "clang/Basic/AMDGPUTypes.def"
@ -12739,6 +12744,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
      Type = Context.AMDGPUBufferRsrcTy;
      break;
    }
+    case 'c': {
+      Type = Context.AMDGPUFeaturePredicateTy;
+      break;
+    }
    case 't': {
      Type = Context.AMDGPUTextureTy;
      break;
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@ -2585,6 +2585,8 @@ bool Type::isSizelessBuiltinType() const {
      // HLSL intangible types
 #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/HLSLIntangibleTypes.def"
+      // AMDGPU feature predicate type
+    case BuiltinType::AMDGPUFeaturePredicate:
      return true;
    default:
      return false;
--- a/clang/lib/Basic/Targets/SPIR.cpp
+++ b/clang/lib/Basic/Targets/SPIR.cpp
@ -181,3 +181,12 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) {
    Float128Format = DoubleFormat;
  }
 }
+
+bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const {
+  return AMDGPUTI.isValidCPUName(CPU);
+}
+
+void SPIRV64AMDGCNTargetInfo::fillValidCPUList(
+    SmallVectorImpl<StringRef> &Values) const {
+  return AMDGPUTI.fillValidCPUList(Values);
+}
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@ -480,6 +480,11 @@ public:
  }

  bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); }
+
+  // This is only needed for validating arguments passed to
+  // __builtin_amdgcn_processor_is
+  bool isValidCPUName(StringRef Name) const override;
+  void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
 };

 class LLVM_LIBRARY_VISIBILITY SPIRV64IntelTargetInfo final
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@ -1105,6 +1105,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
          DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_unsigned); \
    return SingletonId;                                                        \
  }
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align)     \
+  case BuiltinType::Id: {                                                      \
+    if (!SingletonId)                                                          \
+      SingletonId =                                                            \
+          DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_boolean);  \
+    return SingletonId;                                                        \
+  }
 #include "clang/Basic/AMDGPUTypes.def"
  case BuiltinType::UChar:
  case BuiltinType::Char_U:
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@ -1040,6 +1040,10 @@ Value *ScalarExprEmitter::EmitConversionToBool(Value *Src, QualType SrcType) {
  if (const MemberPointerType *MPT = dyn_cast<MemberPointerType>(SrcType))
    return CGF.CGM.getCXXABI().EmitMemberPointerIsNotNull(CGF, Src, MPT);

+  // The conversion is a NOP, and will be done when CodeGening the builtin.
+  if (SrcType == CGF.getContext().AMDGPUFeaturePredicateTy)
+    return Src;
+
  assert((SrcType->isIntegerType() || isa<llvm::PointerType>(Src->getType())) &&
         "Unknown scalar type to convert");

--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@ -610,6 +610,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
  case BuiltinType::Id:                                                        \
    return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier",  \
                                    {}, {Scope});
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align)     \
+  case BuiltinType::Id:                                                        \
+    return ConvertType(getContext().getLogicalOperationType());
 #include "clang/Basic/AMDGPUTypes.def"
 #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/HLSLIntangibleTypes.def"
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/AtomicOrdering.h"
@ -458,6 +459,19 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
  Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
 }

+static Value *GetAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
+  Constant *SpecId = ConstantInt::getAllOnesValue(CGF.Int32Ty);
+
+  LLVMContext &Ctx = CGF.getLLVMContext();
+  MDNode *Predicate = MDNode::get(Ctx, MDString::get(Ctx, Name.str()));
+  std::vector<Value *> Args = {SpecId, ConstantInt::getFalse(Ctx),
+                               MetadataAsValue::get(Ctx, Predicate)};
+  CallInst *Call = CGF.Builder.CreateIntrinsic(
+      Intrinsic::spv_named_boolean_spec_constant, Args);
+
+  return Call;
+}
+
 static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
  switch (BuiltinID) {
  default:
@ -1014,6 +1028,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
    llvm::Value *Env = EmitScalarExpr(E->getArg(0));
    return Builder.CreateCall(F, {Env});
  }
+  case AMDGPU::BI__builtin_amdgcn_processor_is: {
+    assert(CGM.getTriple().isSPIRV() &&
+           "__builtin_amdgcn_processor_is should never reach CodeGen for "
+           "concrete targets!");
+    StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
+    return GetAMDGPUPredicate(*this, "is." + Proc);
+  }
+  case AMDGPU::BI__builtin_amdgcn_is_invocable: {
+    assert(CGM.getTriple().isSPIRV() &&
+           "__builtin_amdgcn_is_invocable should never reach CodeGen for "
+           "concrete targets!");
+    auto *FD = cast<FunctionDecl>(
+        cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
+    StringRef RF =
+        getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+    return GetAMDGPUPredicate(*this, "has." + RF);
+  }
  case AMDGPU::BI__builtin_amdgcn_read_exec:
    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
  case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
--- a/clang/lib/Sema/JumpDiagnostics.cpp
+++ b/clang/lib/Sema/JumpDiagnostics.cpp
@ -19,6 +19,7 @@
 #include "clang/AST/StmtOpenACC.h"
 #include "clang/AST/StmtOpenMP.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/BitVector.h"
 using namespace clang;
@ -369,8 +370,10 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S,

  case Stmt::IfStmtClass: {
    IfStmt *IS = cast<IfStmt>(S);
+    bool AMDGPUPredicate = false;
    if (!(IS->isConstexpr() || IS->isConsteval() ||
-          IS->isObjCAvailabilityCheck()))
+          IS->isObjCAvailabilityCheck() ||
+          (AMDGPUPredicate = this->S.AMDGPU().IsPredicate(IS->getCond()))))
      break;

    unsigned Diag = diag::note_protected_by_if_available;
@ -378,6 +381,8 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S,
      Diag = diag::note_protected_by_constexpr_if;
    else if (IS->isConsteval())
      Diag = diag::note_protected_by_consteval_if;
+    else if (AMDGPUPredicate)
+      Diag = diag::note_amdgcn_protected_by_predicate;

    if (VarDecl *Var = IS->getConditionVariable())
      BuildScopeInformation(Var, ParentScope);
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@ -568,8 +568,13 @@ void Sema::Initialize() {
  }

  if (Context.getTargetInfo().getTriple().isAMDGPU() ||
+      (Context.getTargetInfo().getTriple().isSPIRV() &&
+       Context.getTargetInfo().getTriple().getVendor() == llvm::Triple::AMD) ||
      (Context.getAuxTargetInfo() &&
-       Context.getAuxTargetInfo()->getTriple().isAMDGPU())) {
+       (Context.getAuxTargetInfo()->getTriple().isAMDGPU() ||
+        (Context.getAuxTargetInfo()->getTriple().isSPIRV() &&
+         Context.getAuxTargetInfo()->getTriple().getVendor() ==
+             llvm::Triple::AMD)))) {
 #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)                       \
  addImplicitTypedef(Name, Context.SingletonId);
 #include "clang/Basic/AMDGPUTypes.def"
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@ -11,14 +11,24 @@
 //===----------------------------------------------------------------------===//

 #include "clang/Sema/SemaAMDGPU.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DynamicRecursiveASTVisitor.h"
+#include "clang/AST/Expr.h"
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/Basic/TargetInfo.h"
 #include "clang/Sema/Ownership.h"
+#include "clang/Sema/Scope.h"
 #include "clang/Sema/Sema.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/AtomicOrdering.h"
+#include "llvm/TargetParser/TargetParser.h"
 #include <cstdint>
+#include <utility>

 namespace clang {

@ -742,4 +752,298 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D,
  addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr);
 }

+Expr *SemaAMDGPU::ExpandAMDGPUPredicateBuiltIn(Expr *E) {
+  CallExpr *CE = cast<CallExpr>(E->IgnoreParens());
+  ASTContext &Ctx = getASTContext();
+  QualType BoolTy = Ctx.getLogicalOperationType();
+  llvm::APInt False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy));
+  llvm::APInt True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy));
+  SourceLocation Loc = CE->getExprLoc();
+
+  if (!CE->getBuiltinCallee())
+    return *ExpandedPredicates
+                .insert(IntegerLiteral::Create(Ctx, False, BoolTy, Loc))
+                .first;
+
+  bool P = false;
+  unsigned BI = CE->getBuiltinCallee();
+  if (Ctx.BuiltinInfo.isAuxBuiltinID(BI))
+    BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI);
+
+  if (BI == AMDGPU::BI__builtin_amdgcn_processor_is) {
+    auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+    if (!GFX) {
+      Diag(Loc, diag::err_amdgcn_processor_is_arg_not_literal);
+      return nullptr;
+    }
+
+    StringRef N = GFX->getString();
+    const TargetInfo &TI = Ctx.getTargetInfo();
+    const TargetInfo *AuxTI = Ctx.getAuxTargetInfo();
+    if (!TI.isValidCPUName(N) && (!AuxTI || !AuxTI->isValidCPUName(N))) {
+      Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+      SmallVector<StringRef, 32> ValidList;
+      if (TI.getTriple().getVendor() == llvm::Triple::VendorType::AMD)
+        TI.fillValidCPUList(ValidList);
+      else if (AuxTI) // Since the BI is present it must be an AMDGPU triple.
+        AuxTI->fillValidCPUList(ValidList);
+      if (!ValidList.empty())
+        Diag(Loc, diag::note_amdgcn_processor_is_valid_options)
+            << llvm::join(ValidList, ", ");
+      return nullptr;
+    }
+    if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+      CE->setType(BoolTy);
+      return *ExpandedPredicates.insert(CE).first;
+    }
+
+    if (auto TID = Ctx.getTargetInfo().getTargetID())
+      P = TID->find(N) == 0;
+  } else {
+    Expr *Arg = CE->getArg(0);
+    if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) {
+      Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+      return nullptr;
+    }
+
+    if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+      CE->setType(BoolTy);
+      return *ExpandedPredicates.insert(CE).first;
+    }
+
+    auto *FD = cast<FunctionDecl>(Arg->getReferencedDeclOfCallee());
+
+    StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+    llvm::StringMap<bool> CF;
+    Ctx.getFunctionFeatureMap(CF, FD);
+
+    P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
+  }
+
+  return *ExpandedPredicates
+              .insert(
+                  IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc))
+              .first;
+}
+
+bool SemaAMDGPU::IsPredicate(Expr *E) const {
+  return ExpandedPredicates.contains(E);
+}
+
+void SemaAMDGPU::AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD) {
+  PotentiallyUnguardedBuiltinUsers.insert(FD);
+}
+
+bool SemaAMDGPU::HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const {
+  return PotentiallyUnguardedBuiltinUsers.contains(FD);
+}
+
+namespace {
+/// This class implements -Wamdgpu-unguarded-builtin-usage.
+///
+/// This is done with a traversal of the AST of a function that includes a
+/// call to a target specific builtin. Whenever we encounter an \c if of the
+/// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement
+/// guarded.
+class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
+  // TODO: this could eventually be extended to consider what happens when there
+  //       are multiple target architectures specified via target("arch=gfxXXX")
+  //       target("arch=gfxyyy") etc., as well as feature disabling via "-XXX".
+  Sema &SemaRef;
+
+  SmallVector<StringRef> TargetFeatures;
+  SmallVector<std::pair<SourceLocation, StringRef>> CurrentGFXIP;
+  SmallVector<unsigned> GuardedBuiltins;
+
+  static Expr *FindPredicate(Expr *Cond) {
+    if (auto *CE = dyn_cast<CallExpr>(Cond)) {
+      if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable ||
+          CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_processor_is)
+        return Cond;
+    } else if (auto *UO = dyn_cast<UnaryOperator>(Cond)) {
+      return FindPredicate(UO->getSubExpr());
+    } else if (auto *BO = dyn_cast<BinaryOperator>(Cond)) {
+      if ((Cond = FindPredicate(BO->getLHS())))
+        return Cond;
+      return FindPredicate(BO->getRHS());
+    }
+    return nullptr;
+  }
+
+  bool EnterPredicateGuardedContext(CallExpr *P);
+  void ExitPredicateGuardedContext(bool WasProcessorCheck);
+  bool TraverseGuardedStmt(Stmt *S, CallExpr *P);
+
+public:
+  DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef) {
+    if (auto *TAT = SemaRef.getCurFunctionDecl(true)->getAttr<TargetAttr>()) {
+      // We use the somewhat misnamed x86 accessors because they provide exactly
+      // what we require.
+      TAT->getX86AddedFeatures(TargetFeatures);
+      if (auto GFXIP = TAT->getX86Architecture())
+        CurrentGFXIP.emplace_back(TAT->getLocation(), *GFXIP);
+    }
+  }
+
+  bool TraverseLambdaExpr(LambdaExpr *LE) override {
+    if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage(
+            LE->getCallOperator()))
+      return true; // We have already handled this.
+    return DynamicRecursiveASTVisitor::TraverseLambdaExpr(LE);
+  }
+
+  bool TraverseStmt(Stmt *S) override {
+    if (!S)
+      return true;
+    return DynamicRecursiveASTVisitor::TraverseStmt(S);
+  }
+
+  void IssueDiagnostics(Stmt *S) { TraverseStmt(S); }
+
+  bool TraverseIfStmt(IfStmt *If) override {
+    if (auto *CE = dyn_cast_or_null<CallExpr>(FindPredicate(If->getCond())))
+      return TraverseGuardedStmt(If, CE);
+    return DynamicRecursiveASTVisitor::TraverseIfStmt(If);
+  }
+
+  bool TraverseCaseStmt(CaseStmt *CS) override {
+    return TraverseStmt(CS->getSubStmt());
+  }
+
+  bool TraverseConditionalOperator(ConditionalOperator *CO) override {
+    if (auto *CE = dyn_cast_or_null<CallExpr>(FindPredicate(CO->getCond())))
+      return TraverseGuardedStmt(CO, CE);
+    return DynamicRecursiveASTVisitor::TraverseConditionalOperator(CO);
+  }
+
+  bool VisitAsmStmt(AsmStmt *ASM) override;
+  bool VisitCallExpr(CallExpr *CE) override;
+};
+
+bool DiagnoseUnguardedBuiltins::EnterPredicateGuardedContext(CallExpr *P) {
+  bool IsProcessorCheck =
+      P->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_processor_is;
+
+  if (IsProcessorCheck) {
+    StringRef G = cast<clang::StringLiteral>(P->getArg(0))->getString();
+    // TODO: handle generic ISAs.
+    if (!CurrentGFXIP.empty() && G != CurrentGFXIP.back().second) {
+      SemaRef.Diag(P->getExprLoc(),
+                   diag::err_amdgcn_conflicting_is_processor_options)
+          << P;
+      SemaRef.Diag(CurrentGFXIP.back().first,
+                   diag::note_amdgcn_previous_is_processor_guard);
+    }
+    CurrentGFXIP.emplace_back(P->getExprLoc(), G);
+  } else {
+    auto *FD = cast<FunctionDecl>(
+        cast<DeclRefExpr>(P->getArg(0))->getReferencedDeclOfCallee());
+    GuardedBuiltins.push_back(FD->getBuiltinID());
+  }
+
+  return IsProcessorCheck;
+}
+
+void DiagnoseUnguardedBuiltins::ExitPredicateGuardedContext(bool WasProcCheck) {
+  if (WasProcCheck)
+    CurrentGFXIP.pop_back();
+  else
+    GuardedBuiltins.pop_back();
+}
+
+inline std::pair<Stmt *, Stmt *> GetTraversalOrder(Stmt *S) {
+  std::pair<Stmt *, Stmt *> Ordered;
+  Expr *Condition = nullptr;
+
+  if (auto *CO = dyn_cast<ConditionalOperator>(S)) {
+    Condition = CO->getCond();
+    Ordered = {CO->getTrueExpr(), CO->getFalseExpr()};
+  } else if (auto *If = dyn_cast<IfStmt>(S)) {
+    Condition = If->getCond();
+    Ordered = {If->getThen(), If->getElse()};
+  }
+
+  if (auto *UO = dyn_cast<UnaryOperator>(Condition))
+    if (UO->getOpcode() == UnaryOperatorKind::UO_LNot)
+      std::swap(Ordered.first, Ordered.second);
+
+  return Ordered;
+}
+
+bool DiagnoseUnguardedBuiltins::TraverseGuardedStmt(Stmt *S, CallExpr *P) {
+  assert(S && "Unexpected missing Statement!");
+  assert(P && "Unexpected missing Predicate!");
+
+  auto [Guarded, Unguarded] = GetTraversalOrder(S);
+
+  bool WasProcessorCheck = EnterPredicateGuardedContext(P);
+
+  bool Continue = TraverseStmt(Guarded);
+
+  ExitPredicateGuardedContext(WasProcessorCheck);
+
+  return Continue && TraverseStmt(Unguarded);
+}
+
+bool DiagnoseUnguardedBuiltins::VisitAsmStmt(AsmStmt *ASM) {
+  // TODO: should we check if the ASM is valid for the target? Can we?
+  if (!CurrentGFXIP.empty())
+    return true;
+
+  std::string S = ASM->generateAsmString(SemaRef.getASTContext());
+  SemaRef.Diag(ASM->getAsmLoc(), diag::warn_amdgcn_unguarded_asm_stmt) << S;
+  SemaRef.Diag(ASM->getAsmLoc(), diag::note_amdgcn_unguarded_asm_silence) << S;
+
+  return true;
+}
+
+bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
+  unsigned ID = CE->getBuiltinCallee();
+  Builtin::Context &BInfo = SemaRef.getASTContext().BuiltinInfo;
+
+  if (!ID)
+    return true;
+  if (!BInfo.isTSBuiltin(ID))
+    return true;
+  if (ID == AMDGPU::BI__builtin_amdgcn_processor_is ||
+      ID == AMDGPU::BI__builtin_amdgcn_is_invocable)
+    return true;
+  if (llvm::find(GuardedBuiltins, ID) != GuardedBuiltins.end())
+    return true;
+
+  StringRef FL(BInfo.getRequiredFeatures(ID));
+  llvm::StringMap<bool> FeatureMap;
+  if (CurrentGFXIP.empty()) {
+    for (auto &&F : TargetFeatures)
+      FeatureMap[F] = true;
+    for (auto &&GID : GuardedBuiltins)
+      for (auto &&F : llvm::split(BInfo.getRequiredFeatures(GID), ','))
+        FeatureMap[F] = true;
+  } else {
+    static const llvm::Triple AMDGCN("amdgcn-amd-amdhsa");
+    llvm::AMDGPU::fillAMDGPUFeatureMap(CurrentGFXIP.back().second, AMDGCN,
+                                       FeatureMap);
+  }
+
+  FunctionDecl *BI = CE->getDirectCallee();
+  SourceLocation BICallLoc = CE->getExprLoc();
+  if (Builtin::evaluateRequiredTargetFeatures(FL, FeatureMap)) {
+    SemaRef.Diag(BICallLoc, diag::warn_amdgcn_unguarded_builtin) << BI;
+    SemaRef.Diag(BICallLoc, diag::note_amdgcn_unguarded_builtin_silence) << BI;
+  } else {
+    StringRef GFXIP = CurrentGFXIP.empty() ? "" : CurrentGFXIP.back().second;
+    SemaRef.Diag(BICallLoc, diag::err_amdgcn_incompatible_builtin)
+        << BI << FL << !CurrentGFXIP.empty() << GFXIP;
+    if (!CurrentGFXIP.empty())
+      SemaRef.Diag(CurrentGFXIP.back().first,
+                   diag::note_amdgcn_previous_is_processor_guard);
+  }
+
+  return true;
+}
+} // Unnamed namespace
+
+void SemaAMDGPU::DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD) {
+  DiagnoseUnguardedBuiltins(SemaRef).IssueDiagnostics(FD->getBody());
+}
 } // namespace clang
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@ -23,6 +23,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Initialization.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaHLSL.h"
 #include "clang/Sema/SemaObjC.h"
 #include "clang/Sema/SemaRISCV.h"
@ -1592,6 +1593,13 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr,
    return TC_Success;
  }

+  if (SrcType == Self.Context.AMDGPUFeaturePredicateTy &&
+      DestType == Self.Context.getLogicalOperationType()) {
+    SrcExpr = Self.AMDGPU().ExpandAMDGPUPredicateBuiltIn(SrcExpr.get());
+    Kind = CK_NoOp;
+    return TC_Success;
+  }
+
  // We tried everything. Everything! Nothing works! :-(
  return TC_NotApplicable;
 }
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@ -47,6 +47,7 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaARM.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaHLSL.h"
@ -13940,6 +13941,16 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
    return;
  }

+  // __amdgpu_feature_predicate_t cannot be initialised
+  if (VDecl->getType().getDesugaredType(Context) ==
+      Context.AMDGPUFeaturePredicateTy) {
+    Diag(VDecl->getLocation(),
+         diag::err_amdgcn_predicate_type_is_not_constructible)
+        << VDecl;
+    VDecl->setInvalidDecl();
+    return;
+  }
+
  // WebAssembly tables can't be used to initialise a variable.
  if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) {
    Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0;
@ -14462,6 +14473,13 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
  if (VarDecl *Var = dyn_cast<VarDecl>(RealDecl)) {
    QualType Type = Var->getType();

+    if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) {
+      Diag(Var->getLocation(),
+           diag::err_amdgcn_predicate_type_is_not_constructible)
+          << Var;
+      Var->setInvalidDecl();
+      return;
+    }
    // C++1z [dcl.dcl]p1 grammar implies that an initializer is mandatory.
    if (isa<DecompositionDecl>(RealDecl)) {
      Diag(Var->getLocation(), diag::err_decomp_decl_requires_init) << Var;
@ -17021,8 +17039,12 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, bool IsInstantiation,
      return nullptr;
    }

-    if (Body && FSI->HasPotentialAvailabilityViolations)
-      DiagnoseUnguardedAvailabilityViolations(dcl);
+    if (Body) {
+      if (FSI->HasPotentialAvailabilityViolations)
+        DiagnoseUnguardedAvailabilityViolations(dcl);
+      else if (AMDGPU().HasPotentiallyUnguardedBuiltinUsage(FD))
+        AMDGPU().DiagnoseUnguardedBuiltinUsage(FD);
+    }

    assert(!FSI->ObjCShouldCallSuper &&
           "This should only be set for ObjC methods, which should have been "
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@ -53,6 +53,7 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaARM.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaFixItUtils.h"
@ -6779,6 +6780,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
  if (Result.isInvalid()) return ExprError();
  Fn = Result.get();

+  // The __builtin_amdgcn_is_invocable builtin is special, and will be resolved
+  // later, when we check boolean conditions, for now we merely forward it
+  // without any additional checking.
+  if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 &&
+      ArgExprs[0]->getType() == Context.BuiltinFnTy) {
+    const auto *FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
+
+    if (FD->getName() == "__builtin_amdgcn_is_invocable") {
+      QualType FnPtrTy = Context.getPointerType(FD->getType());
+      Expr *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
+      return CallExpr::Create(
+          Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy,
+          ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride());
+    }
+  }
+
  if (CheckArgsForPlaceholders(ArgExprs))
    return ExprError();

@ -6897,6 +6914,15 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,

    FunctionDecl *FDecl = dyn_cast<FunctionDecl>(NDecl);
    if (FDecl && FDecl->getBuiltinID()) {
+      const llvm::Triple &Triple = Context.getTargetInfo().getTriple();
+      if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD) {
+        if (Context.BuiltinInfo.isTSBuiltin(FDecl->getBuiltinID()) &&
+            !Context.BuiltinInfo.isAuxBuiltinID(FDecl->getBuiltinID())) {
+          AMDGPU().AddPotentiallyUnguardedBuiltinUser(cast<FunctionDecl>(
+              getFunctionLevelDeclContext(/*AllowLambda=*/true)));
+        }
+      }
+
      // Rewrite the function decl for this builtin by replacing parameters
      // with no explicit address space with the address space of the arguments
      // in ArgExprs.
@ -13905,6 +13931,11 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
    if (RHS.isInvalid())
      return QualType();

+    if (LHS.get()->getType() == Context.AMDGPUFeaturePredicateTy)
+      LHS = AMDGPU().ExpandAMDGPUPredicateBuiltIn(LHS.get());
+    if (RHS.get()->getType() == Context.AMDGPUFeaturePredicateTy)
+      RHS = AMDGPU().ExpandAMDGPUPredicateBuiltIn(RHS.get());
+
    if (!LHS.get()->getType()->isScalarType() ||
        !RHS.get()->getType()->isScalarType())
      return InvalidOperands(Loc, LHS, RHS);
@ -16315,6 +16346,10 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
        // Vector logical not returns the signed variant of the operand type.
        resultType = GetSignedVectorType(resultType);
        break;
+      } else if (resultType == Context.AMDGPUFeaturePredicateTy) {
+        resultType = Context.getLogicalOperationType();
+        Input = AMDGPU().ExpandAMDGPUPredicateBuiltIn(InputExpr);
+        break;
      } else {
        return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
                         << resultType << Input.get()->getSourceRange());
@ -21176,6 +21211,9 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
  E = result.get();

  if (!E->isTypeDependent()) {
+    if (E->getType() == Context.AMDGPUFeaturePredicateTy)
+      return AMDGPU().ExpandAMDGPUPredicateBuiltIn(E);
+
    if (getLangOpts().CPlusPlus)
      return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4

--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@ -9244,6 +9244,15 @@ bool InitializationSequence::Diagnose(Sema &S,

  case FK_ConversionFailed: {
    QualType FromType = OnlyArg->getType();
+    // __amdgpu_feature_predicate_t can be explicitly cast to the logical op
+    // type, although this is almost always an error and we advise against it.
+    if (FromType == S.Context.AMDGPUFeaturePredicateTy &&
+        DestType == S.Context.getLogicalOperationType()) {
+      S.Diag(OnlyArg->getExprLoc(),
+             diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
+          << OnlyArg << DestType;
+      break;
+    }
    PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed)
      << (int)Entity.getKind()
      << DestType
@ -10056,6 +10065,14 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity,
  if (EqualLoc.isInvalid())
    EqualLoc = InitE->getBeginLoc();

+  if (Entity.getType().getDesugaredType(Context) ==
+          Context.AMDGPUFeaturePredicateTy &&
+      Entity.getDecl()) {
+    Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible)
+        << Entity.getDecl();
+    return ExprError();
+  }
+
  InitializationKind Kind = InitializationKind::CreateCopy(
      InitE->getBeginLoc(), EqualLoc, AllowExplicit);
  InitializationSequence Seq(*this, Entity, Kind, InitE, TopLevelOfInitList);
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@ -30,6 +30,7 @@
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Overload.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaARM.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaObjC.h"
@ -6365,12 +6366,13 @@ TryContextuallyConvertToBool(Sema &S, Expr *From) {
 ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) {
  if (checkPlaceholderForOverload(*this, From))
    return ExprError();
+  if (From->getType() == Context.AMDGPUFeaturePredicateTy)
+    return AMDGPU().ExpandAMDGPUPredicateBuiltIn(From);

  ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From);
  if (!ICS.isBad())
    return PerformImplicitConversion(From, Context.BoolTy, ICS,
                                     AssignmentAction::Converting);
-
  if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy))
    return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition)
           << From->getType() << From->getSourceRange();
@ -12218,6 +12220,16 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
  if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn))
    return;

+  // __amdgpu_feature_predicate_t can be explicitly cast to the logical op type,
+  // although this is almost always an error and we advise against it.
+  if (FromTy == S.Context.AMDGPUFeaturePredicateTy &&
+      ToTy == S.Context.getLogicalOperationType()) {
+    S.Diag(Conv.Bad.FromExpr->getExprLoc(),
+           diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
+        << Conv.Bad.FromExpr << ToTy;
+    return;
+  }
+
  // Emit the generic diagnostic and, optionally, add the hints to it.
  PartialDiagnostic FDiag = S.PDiag(diag::note_ovl_candidate_bad_conv);
  FDiag << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@ -0,0 +1,72 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 6
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s
+
+// Test that, depending on triple and, if applicable, target-cpu, one of three
+// things happens:
+//    1) for gfx900 we emit an empty kernel (concrete target, lacks feature)
+//    2) for gfx1010 we emit a call to trap (concrete target, has feature)
+//    3) for AMDGCNSPIRV we emit a boolean specialisation constant, via a call
+//       to __spirv_SpecConstant, with the id of UINT32_MAX, and the boolean
+//       value of false, which will yield an OpSpecConstantFalse in SPIR-V
+
+// AMDGCN-GFX900-LABEL: define dso_local void @foo(
+// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX900-NEXT:    ret void
+//
+// AMDGCN-GFX1010-LABEL: define dso_local void @foo(
+// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX1010-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX1010-NEXT:    call void @llvm.trap()
+// AMDGCN-GFX1010-NEXT:    ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @foo(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META2:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[LOR_LHS_FALSE:.*]]
+// AMDGCNSPIRV:       [[LOR_LHS_FALSE]]:
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META3:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    [[TOBOOL1:%.*]] = icmp ne i1 [[TMP1]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL1]], label %[[IF_THEN]], label %[[LOR_LHS_FALSE2:.*]]
+// AMDGCNSPIRV:       [[LOR_LHS_FALSE2]]:
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META4:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    [[TOBOOL3:%.*]] = icmp ne i1 [[TMP2]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL3]], label %[[IF_THEN]], label %[[IF_END:.*]]
+// AMDGCNSPIRV:       [[IF_THEN]]:
+// AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
+// AMDGCNSPIRV-NEXT:    br label %[[IF_END]]
+// AMDGCNSPIRV:       [[IF_END]]:
+// AMDGCNSPIRV-NEXT:    ret void
+//
+void foo() {
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16) ||
+        (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16_var)) ||
+        (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ashr_pk_i8_i32)))
+        return __builtin_trap();
+}
+//.
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" }
+//.
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" }
+// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+// AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX900: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCNSPIRV: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// AMDGCNSPIRV: [[META2]] = !{!"has.gfx10-insts"}
+// AMDGCNSPIRV: [[META3]] = !{!"has.gfx12-insts"}
+// AMDGCNSPIRV: [[META4]] = !{!"has.ashr-pk-insts"}
+//.
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@ -0,0 +1,83 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s
+
+// Test that, depending on triple and, if applicable, target-cpu, one of three
+// things happens:
+//    1) for gfx900 we emit a call to trap (concrete target, matches)
+//    2) for gfx1010 we emit an empty kernel (concrete target, does not match)
+//    3) for AMDGCNSPIRV we emit a boolean specialisation constant, via a call
+//       to __spirv_SpecConstant, with the id of UINT32_MAX, and the boolean
+//       value of false, which will yield an OpSpecConstantFalse in SPIR-V
+// AMDGCN-GFX900-LABEL: define dso_local void @foo(
+// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX900-NEXT:    call void @llvm.trap()
+// AMDGCN-GFX900-NEXT:    ret void
+//
+// AMDGCN-GFX1010-LABEL: define dso_local void @foo(
+// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX1010-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX1010-NEXT:    ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @foo(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META2:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[LOR_LHS_FALSE:.*]]
+// AMDGCNSPIRV:       [[LOR_LHS_FALSE]]:
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META3:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    [[TOBOOL1:%.*]] = icmp ne i1 [[TMP1]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL1]], label %[[IF_THEN]], label %[[LOR_LHS_FALSE2:.*]]
+// AMDGCNSPIRV:       [[LOR_LHS_FALSE2]]:
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META4:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    [[TOBOOL3:%.*]] = icmp ne i1 [[TMP2]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL3]], label %[[IF_THEN]], label %[[LOR_LHS_FALSE4:.*]]
+// AMDGCNSPIRV:       [[LOR_LHS_FALSE4]]:
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META5:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    [[TOBOOL5:%.*]] = icmp ne i1 [[TMP3]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL5]], label %[[IF_THEN]], label %[[LOR_LHS_FALSE6:.*]]
+// AMDGCNSPIRV:       [[LOR_LHS_FALSE6]]:
+// AMDGCNSPIRV-NEXT:    [[TMP4:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META6:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    [[TOBOOL7:%.*]] = icmp ne i1 [[TMP4]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL7]], label %[[IF_THEN]], label %[[IF_END:.*]]
+// AMDGCNSPIRV:       [[IF_THEN]]:
+// AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
+// AMDGCNSPIRV-NEXT:    br label %[[IF_END]]
+// AMDGCNSPIRV:       [[IF_END]]:
+// AMDGCNSPIRV-NEXT:    ret void
+//
+void foo() {
+    if (__builtin_amdgcn_processor_is("gfx900") ||
+        __builtin_amdgcn_processor_is("gfx906") ||
+        __builtin_amdgcn_processor_is("gfx90c") ||
+        (__builtin_amdgcn_processor_is("gfx90a")) ||
+        (__builtin_amdgcn_processor_is("gfx942")))
+        return __builtin_trap();
+}
+//.
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" }
+// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" }
+//.
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+// AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX900: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCNSPIRV: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// AMDGCNSPIRV: [[META2]] = !{!"is.gfx900"}
+// AMDGCNSPIRV: [[META3]] = !{!"is.gfx906"}
+// AMDGCNSPIRV: [[META4]] = !{!"is.gfx90c"}
+// AMDGCNSPIRV: [[META5]] = !{!"is.gfx90a"}
+// AMDGCNSPIRV: [[META6]] = !{!"is.gfx942"}
+//.
--- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@ -0,0 +1,48 @@
+// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s
+
+bool predicate(bool x);
+void pass_by_value(__amdgpu_feature_predicate_t x);
+
+void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv,
+                  __amdgpu_feature_predicate_t &&rv) {
+    // CHECK: error: 'a' has type __amdgpu_feature_predicate_t, which is not constructible
+    __amdgpu_feature_predicate_t a;
+    // CHECK: error: 'b' has type __amdgpu_feature_predicate_t, which is not constructible
+    __amdgpu_feature_predicate_t b = __builtin_amdgcn_processor_is("gfx906");
+    // CHECK: error: 'c' has type __amdgpu_feature_predicate_t, which is not constructible
+    __amdgpu_feature_predicate_t c = lv;
+    // CHECK: error: 'd' has type __amdgpu_feature_predicate_t, which is not constructible
+    __amdgpu_feature_predicate_t d = rv;
+    // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+    bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906");
+    // CHECK: error: 'x' has type __amdgpu_feature_predicate_t, which is not constructible
+    pass_by_value(__builtin_amdgcn_processor_is("gfx906"));
+    // CHECK: error: '__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+    bool invalid_use_in_init_1 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+    if (bool invalid_use_in_init_2 = __builtin_amdgcn_processor_is("gfx906")) return;
+    // CHECK: error: '__builtin_amdgcn_processor_is("gfx1200")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+    if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x);
+}
+
+void invalid_invocations(int x, const char* str) {
+    // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid
+    // CHECK-DAG: note: valid AMDGCN processor identifiers are: {{.*}}gfx{{.*}}
+    if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
+    // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
+    if (__builtin_amdgcn_processor_is(str)) return;
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid
+    if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid
+    else if (__builtin_amdgcn_is_invocable(str)) return;
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}x{{.*}} is not valid
+    else if (__builtin_amdgcn_is_invocable(x)) return;
+    // CHECK: error: use of undeclared identifier '__builtin_ia32_pause'
+    else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return;
+}
+
+bool return_needs_cast() {
+    // CHECK: error: '__builtin_amdgcn_processor_is("gfx900")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+    return __builtin_amdgcn_processor_is("gfx900");
+}
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@ -5,7 +5,7 @@

 // RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip \
 // RUN:  -aux-triple x86_64-pc-windows-msvc -fcuda-is-device -emit-llvm %s \
-// RUN:  -o - | FileCheck %s
+// RUN:  -o - | FileCheck %s --check-prefix=AMDGCNSPIRV

 #include "Inputs/cuda.h"

@ -28,6 +28,25 @@
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z16use_dispatch_ptrPi(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[DISPATCH_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void use_dispatch_ptr(int* out) {
  const int* dispatch_ptr = (const int*)__builtin_amdgcn_dispatch_ptr();
  *out = *dispatch_ptr;
@ -52,6 +71,25 @@ __global__ void use_dispatch_ptr(int* out) {
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z13use_queue_ptrPi(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[QUEUE_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void use_queue_ptr(int* out) {
  const int* queue_ptr = (const int*)__builtin_amdgcn_queue_ptr();
  *out = *queue_ptr;
@ -76,14 +114,30 @@ __global__ void use_queue_ptr(int* out) {
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z19use_implicitarg_ptrPi(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[IMPLICITARG_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void use_implicitarg_ptr(int* out) {
  const int* implicitarg_ptr = (const int*)__builtin_amdgcn_implicitarg_ptr();
  *out = *implicitarg_ptr;
 }

-__global__
-    //
-    void
 // CHECK-LABEL: @_Z12test_ds_fmaxf(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
@ -96,7 +150,21 @@ __global__
 // CHECK-NEXT:    store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    ret void
 //
-    test_ds_fmax(float src) {
+// AMDGCNSPIRV-LABEL: @_Z12test_ds_fmaxf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
+// AMDGCNSPIRV-NEXT:    store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
+__global__ void test_ds_fmax(float src) {
+//
+//
  __shared__ float shared;
  volatile float x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, false);
 }
@ -113,6 +181,18 @@ __global__
 // CHECK-NEXT:    store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z12test_ds_faddf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
+// AMDGCNSPIRV-NEXT:    store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_ds_fadd(float src) {
  __shared__ float shared;
  volatile float x = __builtin_amdgcn_ds_faddf(&shared, src, 0, 0, false);
@ -139,6 +219,27 @@ __global__ void test_ds_fadd(float src) {
 // CHECK-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z12test_ds_fminfPf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[SHARED:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// AMDGCNSPIRV-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_ds_fmin(float src, float *shared) {
  volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false);
 }
@ -155,6 +256,11 @@ __device__ void test_ret_builtin_nondef_addrspace() {
 // CHECK-NEXT:    call addrspace(4) void @llvm.amdgcn.endpgm()
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z6endpgmv(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.amdgcn.endpgm()
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void endpgm() {
  __builtin_amdgcn_endpgm();
 }
@ -183,6 +289,28 @@ __global__ void endpgm() {
 // CHECK-NEXT:    store i64 [[TMP2]], ptr addrspace(4) [[TMP3]], align 8
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z14test_uicmp_i64Pyyy(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// AMDGCNSPIRV-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP0]], i64 [[TMP1]], i32 35)
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i64 [[TMP2]], ptr addrspace(4) [[TMP3]], align 8
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
 {
  *out = __builtin_amdgcn_uicmpl(a, b, 30+5);
@ -199,14 +327,39 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
 // CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    store i64 [[TMP0]], ptr addrspace(4) [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META5:![0-9]+]])
+// CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+// CHECK:       if.then:
+// CHECK-NEXT:    [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i64 [[TMP1]], ptr addrspace(4) [[TMP2]], align 8
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z14test_s_memtimePy(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META7:![0-9]+]])
+// AMDGCNSPIRV-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+// AMDGCNSPIRV:       if.then:
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i64 [[TMP1]], ptr addrspace(4) [[TMP2]], align 8
+// AMDGCNSPIRV-NEXT:    br label [[IF_END]]
+// AMDGCNSPIRV:       if.end:
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_s_memtime(unsigned long long* out)
 {
-  *out = __builtin_amdgcn_s_memtime();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_memtime))
+    *out = __builtin_amdgcn_s_memtime();
 }

 // Check a generic pointer can be passed as a shared pointer and a generic pointer.
@ -232,9 +385,32 @@ __device__ void func(float *x);
 // CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
 // CHECK-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT:    call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR8:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z17test_ds_fmin_funcfPf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[SHARED:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// AMDGCNSPIRV-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR8:[0-9]+]]
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
  volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false);
  func(shared);
@ -258,6 +434,24 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
 // CHECK-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z14test_is_sharedPf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[RET:%.*]] = alloca i8, align 1
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP1]])
+// AMDGCNSPIRV-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// AMDGCNSPIRV-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_is_shared(float *x){
  bool ret = __builtin_amdgcn_is_shared(x);
 }
@ -280,6 +474,24 @@ __global__ void test_is_shared(float *x){
 // CHECK-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z15test_is_privatePi(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[RET:%.*]] = alloca i8, align 1
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP1]])
+// AMDGCNSPIRV-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// AMDGCNSPIRV-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_is_private(int *x){
  bool ret = __builtin_amdgcn_is_private(x);
 }
--- a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
@ -21,26 +21,30 @@ constexpr static bool BountCtrl()
    return true & false;
 }

-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
 {
-  *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
 }

-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
 {
-  *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
 }

-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
 {
-  *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
 }

-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
 {
-  *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
 }
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@ -12,78 +12,88 @@ typedef unsigned long ulong;
 // CHECK-LABEL: @test_permlane16(
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
 void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
-  *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane16))
+    *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
 }

 // CHECK-LABEL: @test_permlanex16(
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
 void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
-  *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+    *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
 }

 // CHECK-LABEL: @test_mov_dpp8_uint(
 // CHECK:      {{.*}}call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
-// CHECK-NEXT: store i32 %0,
+// CHECK-NEXT: store i32 %[[#]],
 void test_mov_dpp8_uint(global uint* out, uint a) {
-  *out = __builtin_amdgcn_mov_dpp8(a, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_long(
 // CHECK:      {{.*}}call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %a, i32 1)
-// CHECK-NEXT: store i64 %0,
+// CHECK-NEXT: store i64 %[[#]],
 void test_mov_dpp8_long(global long* out, long a) {
-  *out = __builtin_amdgcn_mov_dpp8(a, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_float(
-// CHECK:      %0 = bitcast float %a to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast float %a to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %[[BC]], i32 1)
+// CHECK-NEXT: store i32 %[[DPP_RET]],
 void test_mov_dpp8_float(global float* out, float a) {
-  *out = __builtin_amdgcn_mov_dpp8(a, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast double %x to i64
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %[[BC]], i32 1)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_mov_dpp8_double(double x, global double *p) {
-  *p = __builtin_amdgcn_mov_dpp8(x, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %[[ZEXT]], i32 1)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_mov_dpp8_short(short x, global short *p) {
-  *p = __builtin_amdgcn_mov_dpp8(x, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i8 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %[[ZEXT]], i32 1)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i8
+// CHECK-NEXT: store i8 %[[TRUNC]],
 void test_mov_dpp8_char(char x, global char *p) {
-  *p = __builtin_amdgcn_mov_dpp8(x, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %[[LD:[0-9]+]] = load i16,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %[[LD]] to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %[[ZEXT]], i32 1)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_mov_dpp8_half(half *x, global half *p) {
-  *p = __builtin_amdgcn_mov_dpp8(*x, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *p = __builtin_amdgcn_mov_dpp8(*x, 1);
 }

 // CHECK-LABEL: @test_s_memtime
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime()
 void test_s_memtime(global ulong* out)
 {
-  *out = __builtin_amdgcn_s_memtime();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_memtime))
+    *out = __builtin_amdgcn_s_memtime();
 }

 // CHECK-LABEL: @test_groupstaticsize
@ -97,5 +107,6 @@ void test_groupstaticsize(global uint* out)
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}})
 void test_ballot_wave32(global uint* out, int a, int b)
 {
-  *out = __builtin_amdgcn_ballot_w32(a == b);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ballot_w32))
+    *out = __builtin_amdgcn_ballot_w32(a == b);
 }
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@ -18,36 +18,41 @@ typedef uint uint4 __attribute__((ext_vector_type(4)));
 // CHECK-LABEL: @test_s_sendmsg_rtn(
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
 void test_s_sendmsg_rtn(global uint* out) {
-  *out = __builtin_amdgcn_s_sendmsg_rtn(0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sendmsg_rtn))
+    *out = __builtin_amdgcn_s_sendmsg_rtn(0);
 }

 // CHECK-LABEL: @test_s_sendmsg_rtnl(
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 0)
 void test_s_sendmsg_rtnl(global ulong* out) {
-  *out = __builtin_amdgcn_s_sendmsg_rtnl(0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sendmsg_rtnl))
+    *out = __builtin_amdgcn_s_sendmsg_rtnl(0);
 }

 // CHECK-LABEL: @test_ds_bvh_stack_rtn(
-// CHECK: %0 = tail call{{.*}} { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data, <4 x i32> %data1, i32 128)
-// CHECK: %1 = extractvalue { i32, i32 } %0, 0
-// CHECK: %2 = extractvalue { i32, i32 } %0, 1
-// CHECK: %3 = insertelement <2 x i32> poison, i32 %1, i64 0
-// CHECK: %4 = insertelement <2 x i32> %3, i32 %2, i64 1
+// CHECK: %[[#BVH_STACK:]] = tail call{{.*}} { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data, <4 x i32> %data1, i32 128)
+// CHECK: %[[#RET_FIRST:]] = extractvalue { i32, i32 } %[[#BVH_STACK]], 0
+// CHECK: %[[#RET_SECOND:]] = extractvalue { i32, i32 } %[[#BVH_STACK]], 1
+// CHECK: %[[#OUT:]] = insertelement <2 x i32> poison, i32 %[[#RET_FIRST]], i64 0
+// CHECK: %{{.*}} = insertelement <2 x i32> %[[#OUT]], i32 %[[#RET_SECOND]], i64 1
 void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
 {
-  *out = __builtin_amdgcn_ds_bvh_stack_rtn(addr, data, data1, 128);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_bvh_stack_rtn))
+    *out = __builtin_amdgcn_ds_bvh_stack_rtn(addr, data, data1, 128);
 }

 // CHECK-LABEL: @test_permlane64(
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
 void test_permlane64(global uint* out, uint a) {
-  *out = __builtin_amdgcn_permlane64(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64))
+    *out = __builtin_amdgcn_permlane64(a);
 }

 // CHECK-LABEL: @test_s_wait_event_export_ready
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.wait.event.export.ready
 void test_s_wait_event_export_ready() {
-  __builtin_amdgcn_s_wait_event_export_ready();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+    __builtin_amdgcn_s_wait_event_export_ready();
 }

 // CHECK-LABEL: @test_global_add_f32
@ -58,5 +63,6 @@ void test_global_add_f32(float *rtn, global float *addr, float x) {
 #else
 void test_global_add_f32(float *rtn, __attribute__((address_space(1))) float *addr, float x) {
 #endif
-  *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_atomic_fadd_f32))
+    *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@ -1,9 +1,9 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
-// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN --enable-var-scope %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN --enable-var-scope %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN --enable-var-scope %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN --enable-var-scope %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV --enable-var-scope %s

 #define INVALID_MEMORY_SCOPE (__MEMORY_SCOPE_CLUSTR+1)

@ -16,42 +16,48 @@ typedef unsigned int  uint;
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.div.fixup.f16
 void test_div_fixup_f16(global half* out, half a, half b, half c)
 {
-  *out = __builtin_amdgcn_div_fixuph(a, b, c);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_div_fixuph))
+    *out = __builtin_amdgcn_div_fixuph(a, b, c);
 }

 // CHECK-LABEL: @test_rcp_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.rcp.f16
 void test_rcp_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_rcph(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_rcph))
+    *out = __builtin_amdgcn_rcph(a);
 }

 // CHECK-LABEL: @test_sqrt_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.{{((amdgcn.){0,1})}}sqrt.f16
 void test_sqrt_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_sqrth(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_sqrth))
+    *out = __builtin_amdgcn_sqrth(a);
 }

 // CHECK-LABEL: @test_rsq_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.rsq.f16
 void test_rsq_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_rsqh(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_rsqh))
+    *out = __builtin_amdgcn_rsqh(a);
 }

 // CHECK-LABEL: @test_sin_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.sin.f16
 void test_sin_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_sinh(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_sinh))
+    *out = __builtin_amdgcn_sinh(a);
 }

 // CHECK-LABEL: @test_cos_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.cos.f16
 void test_cos_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_cosh(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_cosh))
+    *out = __builtin_amdgcn_cosh(a);
 }

 // CHECK-LABEL: @test_ldexp_f16
@ -59,179 +65,202 @@ void test_cos_f16(global half* out, half a)
 // CHECK: {{.*}}call{{.*}} half @llvm.ldexp.f16.i16(half %a, i16 [[TRUNC]])
 void test_ldexp_f16(global half* out, half a, int b)
 {
-  *out = __builtin_amdgcn_ldexph(a, b);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ldexph))
+    *out = __builtin_amdgcn_ldexph(a, b);
 }

 // CHECK-LABEL: @test_frexp_mant_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.frexp.mant.f16
 void test_frexp_mant_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_frexp_manth(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_frexp_manth))
+    *out = __builtin_amdgcn_frexp_manth(a);
 }

 // CHECK-LABEL: @test_frexp_exp_f16
 // CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.frexp.exp.i16.f16
 void test_frexp_exp_f16(global short* out, half a)
 {
-  *out = __builtin_amdgcn_frexp_exph(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_frexp_exph))
+    *out = __builtin_amdgcn_frexp_exph(a);
 }

 // CHECK-LABEL: @test_fract_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.fract.f16
 void test_fract_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_fracth(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_fracth))
+    *out = __builtin_amdgcn_fracth(a);
 }

 // CHECK-LABEL: @test_class_f16
 // CHECK: {{.*}}call{{.*}} i1 @llvm.amdgcn.class.f16
 void test_class_f16(global half* out, half a, int b)
 {
-  *out = __builtin_amdgcn_classh(a, b);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_classh))
+    *out = __builtin_amdgcn_classh(a, b);
 }

 // CHECK-LABEL: @test_s_memrealtime
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memrealtime()
 void test_s_memrealtime(global ulong* out)
 {
-  *out = __builtin_amdgcn_s_memrealtime();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_memrealtime))
+    *out = __builtin_amdgcn_s_memrealtime();
 }

 // CHECK-LABEL: @test_s_dcache_wb()
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.dcache.wb()
 void test_s_dcache_wb()
 {
-  __builtin_amdgcn_s_dcache_wb();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_dcache_wb))
+    __builtin_amdgcn_s_dcache_wb();
 }

 // CHECK-LABEL: @test_mov_dpp_int
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %src, i32 0, i32 0, i32 0, i1 false)
 void test_mov_dpp_int(global int* out, int src)
 {
-  *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false);
 }

 // CHECK-LABEL: @test_mov_dpp_long
-// CHECK:      %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %0,
+// CHECK:      %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_mov_dpp_long(long x, global long *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_float
-// CHECK:      %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast float %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %[[BC]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i32 %[[DPP_RET]],
 void test_mov_dpp_float(float x, global float *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast double %x to i64
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %[[BC]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_mov_dpp_double(double x, global double *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_mov_dpp_short(short x, global short *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i8 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i8
+// CHECK-NEXT: store i8 %[[TRUNC]],
 void test_mov_dpp_char(char x, global char *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %[[LD:[0-9]+]] = load i16,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %[[LD]] to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_mov_dpp_half(half *x, global half *p) {
-  *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_int
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp_int(global int* out, int arg1, int arg2)
 {
-  *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
 }

 // CHECK-LABEL: @test_update_dpp_long
-// CHECK:      %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %0,
+// CHECK:      %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_update_dpp_long(long x, global long *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_float
-// CHECK:      %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast float %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %[[BC]], i32 %[[BC]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i32 %[[DPP_RET]],
 void test_update_dpp_float(float x, global float *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast double %x to i64
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %[[BC]], i64 %[[BC]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_update_dpp_double(double x, global double *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %[[ZEXT]], i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_update_dpp_short(short x, global short *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i8 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %[[ZEXT]], i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i8
+// CHECK-NEXT: store i8 %[[TRUNC]],
 void test_update_dpp_char(char x, global char *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %[[LD:[0-9]+]] = load i16,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %[[LD]] to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %[[ZEXT]], i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_update_dpp_half(half *x, global half *p) {
-  *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_int_uint
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp_int_uint(global int* out, int arg1, unsigned int arg2)
 {
-  *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
 }

 // CHECK-LABEL: @test_update_dpp_lit_int
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 5, i32 %arg1, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp_lit_int(global int* out, int arg1)
 {
-  *out = __builtin_amdgcn_update_dpp(5, arg1, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(5, arg1, 0, 0, 0, false);
 }

 __constant int gi = 5;
@ -240,7 +269,8 @@ __constant int gi = 5;
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 5, i32 %arg1, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp_const_int(global int* out, int arg1)
 {
-  *out = __builtin_amdgcn_update_dpp(gi, arg1, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(gi, arg1, 0, 0, 0, false);
 }

 // CHECK-LABEL: @test_ds_fadd
@ -397,14 +427,16 @@ void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime()
 void test_s_memtime(global ulong* out)
 {
-  *out = __builtin_amdgcn_s_memtime();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_memtime))
+    *out = __builtin_amdgcn_s_memtime();
 }

 // CHECK-LABEL: @test_perm
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.perm(i32 %a, i32 %b, i32 %s)
 void test_perm(global uint* out, uint a, uint b, uint s)
 {
-  *out = __builtin_amdgcn_perm(a, b, s);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_perm))
+    *out = __builtin_amdgcn_perm(a, b, s);
 }

 // CHECK-LABEL: @test_groupstaticsize
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@ -1189,31 +1189,36 @@ kernel void test_ds_consume_lds(__attribute__((address_space(1))) int* out, __at
 // CHECK-LABEL: @test_gws_init(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.init(i32 %value, i32 %id)
 kernel void test_gws_init(uint value, uint id) {
-  __builtin_amdgcn_ds_gws_init(value, id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_init))
+    __builtin_amdgcn_ds_gws_init(value, id);
 }

 // CHECK-LABEL: @test_gws_barrier(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.barrier(i32 %value, i32 %id)
 kernel void test_gws_barrier(uint value, uint id) {
-  __builtin_amdgcn_ds_gws_barrier(value, id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_barrier))
+    __builtin_amdgcn_ds_gws_barrier(value, id);
 }

 // CHECK-LABEL: @test_gws_sema_v(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.v(i32 %id)
 kernel void test_gws_sema_v(uint id) {
-  __builtin_amdgcn_ds_gws_sema_v(id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_sema_v))
+    __builtin_amdgcn_ds_gws_sema_v(id);
 }

 // CHECK-LABEL: @test_gws_sema_br(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.br(i32 %value, i32 %id)
 kernel void test_gws_sema_br(uint value, uint id) {
-  __builtin_amdgcn_ds_gws_sema_br(value, id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_sema_br))
+    __builtin_amdgcn_ds_gws_sema_br(value, id);
 }

 // CHECK-LABEL: @test_gws_sema_p(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.p(i32 %id)
 kernel void test_gws_sema_p(uint id) {
-  __builtin_amdgcn_ds_gws_sema_p(id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_sema_p))
+    __builtin_amdgcn_ds_gws_sema_p(id);
 }

 // CHECK-LABEL: @test_mbcnt_lo(
--- a/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip
+++ b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip
@ -0,0 +1,62 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx900 -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx1201 -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void f(int *ptr, int size, bool f) {
+    int i = 0;
+    if (f)
+        goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+    if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+        for (i = 0; i < size; ++i) {
+            label:
+            ptr[i] = i;
+        }
+    }
+}
+
+__device__ void g(int *ptr, int size, bool f) {
+    int i = 0;
+    if (f)
+        goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+        for (i = 0; i < size; ++i) {
+            label:
+            ptr[i] = i;
+        }
+    }
+}
+
+__global__ void h(int *ptr, int size, bool f) {
+    int i = 0;
+    if (f)
+        goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+    if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+        for (i = 0; i < size; ++i) {
+            label:
+            ptr[i] = i;
+        }
+    }
+}
+
+__global__ void i(int *ptr, int size, bool f) {
+    int i = 0;
+    if (f)
+        goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+        for (i = 0; i < size; ++i) {
+            label:
+            ptr[i] = i;
+        }
+    }
+}
--- a/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip
+++ b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip
@ -0,0 +1,31 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx900 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -triple x86_64 -aux-triple amdgcn -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -triple x86_64 -aux-triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+
+__attribute__((device)) auto foo() {
+  return __builtin_amdgcn_processor_is("gfx900");
+}
+
+__attribute__((device)) decltype(auto) bar() {
+  return __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);
+}
+
+// CHECK: |-TypedefDecl {{.*}} implicit __amdgpu_feature_predicate_t '__amdgpu_feature_predicate_t'
+// CHECK-NEXT: | `-BuiltinType {{.*}} '__amdgpu_feature_predicate_t'
+// CHECK-DAG: |-FunctionDecl {{.*}} foo '__amdgpu_feature_predicate_t ()'
+// CHECK-NEXT: |-CompoundStmt {{.*}}
+// CHECK-NEXT: | `-ReturnStmt {{.*}}
+// CHECK-NEXT: |   `-CallExpr {{.*}} '__amdgpu_feature_predicate_t'
+// CHECK-NEXT: |     |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)(const char *) noexcept'
+// CHECK-NEXT: |     | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_processor_is' '__amdgpu_feature_predicate_t (const char *) noexcept'
+// CHECK-NEXT: |     `-StringLiteral {{.*}} "gfx900"
+// CHECK-DAG: |-FunctionDecl {{.*}} bar '__amdgpu_feature_predicate_t ()'
+// CHECK-NEXT: |-CompoundStmt {{.*}}
+// CHECK-NEXT: | `-ReturnStmt {{.*}}
+// CHECK-NEXT: |   `-CallExpr {{.*}} '__amdgpu_feature_predicate_t'
+// CHECK-NEXT: |     |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)() noexcept' <BuiltinFnToFnPtr>
+// CHECK-NEXT: |     | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_is_invocable' '__amdgpu_feature_predicate_t () noexcept'
--- a/clang/test/SemaHIP/amdgpu-feature-predicate.hip
+++ b/clang/test/SemaHIP/amdgpu-feature-predicate.hip
@ -0,0 +1,40 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRED: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo(__amdgpu_feature_predicate_t l) {
+  decltype(__builtin_amdgcn_processor_is("gfx900")) what; // expected-error {{'what' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  typeof(__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep)) why; // expected-error {{'why' has type __amdgpu_feature_predicate_t, which is not constructible}}
+
+  bool b = true;
+  __amdgpu_feature_predicate_t v = false; // expected-error {{'v' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  static_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{static_cast from 'bool' to '__amdgpu_feature_predicate_t' is not allowed}}
+  dynamic_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{invalid target type '__amdgpu_feature_predicate_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+  reinterpret_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{reinterpret_cast from 'bool' to '__amdgpu_feature_predicate_t' is not allowed}}
+  __amdgpu_feature_predicate_t k; // expected-error {{'k' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  int *ip = (int *)l; // expected-error {{cannot cast from type '__amdgpu_feature_predicate_t' to pointer type 'int *'}}
+  void *vp = (void *)l; // expected-error {{cannot cast from type '__amdgpu_feature_predicate_t' to pointer type 'void *'}}
+}
+
+__global__ void bar(__amdgpu_feature_predicate_t l) {
+  decltype(__builtin_amdgcn_processor_is("gfx900")) what; // expected-error {{'what' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  typeof(__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep)) why; // expected-error {{'why' has type __amdgpu_feature_predicate_t, which is not constructible}}
+
+  bool b = true;
+  __amdgpu_feature_predicate_t v = false; // expected-error {{'v' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  static_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{static_cast from 'bool' to '__amdgpu_feature_predicate_t' is not allowed}}
+  dynamic_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{invalid target type '__amdgpu_feature_predicate_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+  reinterpret_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{reinterpret_cast from 'bool' to '__amdgpu_feature_predicate_t' is not allowed}}
+  __amdgpu_feature_predicate_t k; // expected-error {{'k' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  int *ip = (int *)l; // expected-error {{cannot cast from type '__amdgpu_feature_predicate_t' to pointer type 'int *'}}
+  void *vp = (void *)l; // expected-error {{cannot cast from type '__amdgpu_feature_predicate_t' to pointer type 'void *'}}
+}
+
+static_assert(sizeof(__amdgpu_feature_predicate_t) == 0); // expected-error {{invalid application of 'sizeof' to sizeless type '__amdgpu_feature_predicate_t'}}
+static_assert(alignof(__amdgpu_feature_predicate_t) == 0); // expected-error {{invalid application of 'alignof' to sizeless type '__amdgpu_feature_predicate_t'}};
--- a/clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip
+++ b/clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip
@ -0,0 +1,115 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wamdgpu-unguarded-builtin-usage %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void g();
+
+__device__ void f(int x, bool b) {
+    long v15_16;
+    __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x)); // expected-warning {{the 'v_lshlrev_b64 v[15:16], 0, $0' ASM sequence might be invalid for some AMDGPU targets}}
+    // expected-note@-1 {{enclose the 'v_lshlrev_b64 v[15:16], 0, $0' ASM sequence in a scope controlled by a __builtin_amdgcn_is_processor check to silence this warning}}
+
+    if (__builtin_amdgcn_processor_is("gfx90a")) {
+        long v15_16;
+        __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x));
+    }
+
+    if (!__builtin_amdgcn_processor_is("gfx90a")) {
+        long v15_16;
+        __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x)); // expected-warning {{the 'v_lshlrev_b64 v[15:16], 0, $0' ASM sequence might be invalid for some AMDGPU targets}}
+        // expected-note@-1 {{enclose the 'v_lshlrev_b64 v[15:16], 0, $0' ASM sequence in a scope controlled by a __builtin_amdgcn_is_processor check to silence this warning}}
+    }
+
+    __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var) ? __builtin_amdgcn_s_sleep_var(x) : __builtin_trap();
+    !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var) ? __builtin_amdgcn_s_sleep_var(x) : __builtin_trap(); // expected-error {{'__builtin_amdgcn_s_sleep_var' cannot be invoked in the current context, as it requires the 'gfx12-insts' feature(s)}}
+
+    const auto lambda = [=] __device__  () {
+        __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+        // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+        if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+            __builtin_amdgcn_s_sleep_var(x);
+    };
+
+    const auto generic_lambda = [] __device__ (auto&& y) {
+        __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+        // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+        if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) {
+            __builtin_amdgcn_s_sleep_var(y);
+            // Has the same requirements - gfx12-insts, thus correct, but we should still warn.
+            __builtin_amdgcn_s_barrier_signal_var(nullptr, y); // expected-warning {{'__builtin_amdgcn_s_barrier_signal_var' might be unavailable on some AMDGPU targets}}
+            // expected-note@-1 {{enclose '__builtin_amdgcn_s_barrier_signal_var' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+        }
+    };
+
+    __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    if (__builtin_amdgcn_processor_is("gfx1201")) {
+        if (__builtin_amdgcn_processor_is("gfx906")) // expected-error {{conflicting check for AMDGCN processor '__builtin_amdgcn_processor_is("gfx906")' found in a scope already controlled by a check for AMDGCN processor}}
+        // expected-note@-2 {{predicate guard, with establishes the context, inserted here}}
+            __builtin_trap();
+    }
+
+    if (__builtin_amdgcn_processor_is("gfx900")) {
+        if (__builtin_amdgcn_processor_is("gfx900")) // This is fine, albeit potentially spurious.
+            ++x;
+    }
+
+    if (__builtin_amdgcn_processor_is("gfx1030"))
+        __builtin_amdgcn_s_barrier_signal_isfirst(42); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' cannot be invoked in the current context, as it requires the 'gfx12-insts' feature(s), which 'gfx1030' does not provide}}
+        // expected-note@-2 {{predicate guard, with establishes the context, inserted here}}
+
+    // Direct guard
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep))
+        __builtin_amdgcn_s_sleep(42);
+
+    // Guarded scope
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) {
+        if (b) {
+            g();
+            while (--x > 42)
+                __builtin_amdgcn_s_sleep_var(x);
+        }
+    }
+}
+
+__attribute__((target("arch=gfx1030")))
+__device__ void h(int x) {
+    if (__builtin_amdgcn_processor_is("gfx1030")) // Fine, same processor
+        return;
+
+    long v15_16;
+    __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x)); // "Fine", explicit gfx target
+
+    __builtin_amdgcn_s_ttracedata_imm(42); // expected-warning {{'__builtin_amdgcn_s_ttracedata_imm' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_ttracedata_imm' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    __builtin_amdgcn_s_barrier_signal_isfirst(42); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' cannot be invoked in the current context, as it requires the 'gfx12-insts' feature(s), which 'gfx1030' does not provide}}
+    // expected-note@-12 {{predicate guard, with establishes the context, inserted here}}
+
+    if (__builtin_amdgcn_processor_is("gfx906")) // expected-error {{conflicting check for AMDGCN processor '__builtin_amdgcn_processor_is("gfx906")' found in a scope already controlled by a check for AMDGCN processor}}
+    // expected-note@-15 {{predicate guard, with establishes the context, inserted here}}
+        __builtin_trap();
+}
+
+__attribute__((target("gfx11-insts")))
+__device__ void i(int x) {
+    __builtin_amdgcn_s_wait_event_export_ready(); // expected-warning {{'__builtin_amdgcn_s_wait_event_export_ready' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_wait_event_export_ready' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    __builtin_amdgcn_s_barrier_signal_isfirst(42); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' cannot be invoked in the current context, as it requires the 'gfx12-insts' feature(s)}}
+}
+
+__attribute__((target("gfx11-insts,gfx12-insts")))
+__device__ void j(int x) {
+    __builtin_amdgcn_s_wait_event_export_ready(); // expected-warning {{'__builtin_amdgcn_s_wait_event_export_ready' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_wait_event_export_ready' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    __builtin_amdgcn_s_barrier_signal_isfirst(42); // expected-warning {{'__builtin_amdgcn_s_barrier_signal_isfirst' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_barrier_signal_isfirst' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+}
--- a/clang/test/SemaHIP/amdgpu-is-invocable.hip
+++ b/clang/test/SemaHIP/amdgpu-is-invocable.hip
@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+        return __builtin_trap();
+}
+
+__global__ void bar() {
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+        return __builtin_trap();
+}
--- a/clang/test/SemaHIP/amdgpu-processor-is.hip
+++ b/clang/test/SemaHIP/amdgpu-processor-is.hip
@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+    if (__builtin_amdgcn_processor_is("gfx900"))
+        return __builtin_trap();
+}
+
+__global__ void bar() {
+    if (__builtin_amdgcn_processor_is("gfx900"))
+        return __builtin_trap();
+}
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@ -345,6 +345,7 @@ private:
                               .Case("__fp16", "h")
                               .Case("__hlsl_resource_t", "Qr")
                               .Case("__amdgpu_buffer_rsrc_t", "Qb")
+                               .Case("__amdgpu_feature_predicate_t", "Qc")
                               .Case("__amdgpu_texture_t", "Qt")
                               .Case("__int128_t", "LLLi")
                               .Case("_Float16", "x")
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@ -2403,11 +2403,6 @@ static bool generateSpecConstantInst(const SPIRV::IncomingCall *Call,

  switch (Opcode) {
  case SPIRV::OpSpecConstant: {
-    // Build the SpecID decoration.
-    unsigned SpecId =
-        static_cast<unsigned>(getIConstVal(Call->Arguments[0], MRI));
-    buildOpDecorate(Call->ReturnRegister, MIRBuilder, SPIRV::Decoration::SpecId,
-                    {SpecId});
    // Determine the constant MI.
    Register ConstRegister = Call->Arguments[1];
    const MachineInstr *Const = getDefInstrMaybeConstant(ConstRegister, MRI);
@ -2433,6 +2428,11 @@ static bool generateSpecConstantInst(const SPIRV::IncomingCall *Call,
      else
        addNumImm(ConstOperand.getFPImm()->getValueAPF().bitcastToAPInt(), MIB);
    }
+    // Build the SpecID decoration.
+    unsigned SpecId =
+        static_cast<unsigned>(getIConstVal(Call->Arguments[0], MRI));
+    buildOpDecorate(Call->ReturnRegister, MIRBuilder, SPIRV::Decoration::SpecId,
+                    {SpecId});
    return true;
  }
  case SPIRV::OpSpecConstantComposite: {
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@ -4239,7 +4239,7 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
                   .addUse(GR.getSPIRVTypeID(ResType));
    MIB.constrainAllUses(TII, TRI, RBI);
    unsigned SpecId = I.getOperand(2).getImm();
-    buildOpDecorate(I.getOperand(0).getReg(), *MIB.getInstr(), TII,
+    buildOpDecorate(I.getOperand(0).getReg(), *++MIB->getIterator(), TII,
                    SPIRV::Decoration::SpecId, {SpecId});

    return true;
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@ -439,6 +439,9 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
      if (!CF || !CF->isIntrinsic())
        continue;
      auto *II = cast<IntrinsicInst>(Call);
+      if (Intrinsic::isTargetIntrinsic(II->getIntrinsicID()) &&
+          II->getCalledOperand()->getName().starts_with("llvm.spv"))
+        continue;
      switch (II->getIntrinsicID()) {
      case Intrinsic::memset:
      case Intrinsic::bswap:
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
@ -6,8 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// The pass transforms IR globals that cannot be trivially mapped to SPIRV
-// into something that is trival to lower.
+// The pass:
+//   - transforms IR globals that cannot be trivially mapped to SPIRV into
+//     something that is trival to lower;
+//   - for AMDGCN flavoured SPIRV, it assigns unique IDs to the specialisation
+//     constants associated with feature predicates, which were inserted by the
+//     FE when expanding calls to __builtin_amdgcn_processor_is or
+//     __builtin_amdgcn_is_invocable
 //
 //===----------------------------------------------------------------------===//

@ -15,9 +20,15 @@
 #include "SPIRVUtils.h"

 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"

+#include <climits>
+#include <string>
+
 #define DEBUG_TYPE "spirv-prepare-globals"

 using namespace llvm;
@ -71,6 +82,51 @@ bool tryReplaceAliasWithAliasee(GlobalAlias &GA) {
  return true;
 }

+bool tryAssignPredicateSpecConstIDs(Module &M, Function *F) {
+  StringMap<unsigned> IDs;
+  for (auto &&U : F->users()) {
+    auto *CI = dyn_cast<CallInst>(U);
+    if (!CI)
+      continue;
+
+    auto *SpecID = dyn_cast<ConstantInt>(CI->getArgOperand(0));
+    if (!SpecID)
+      continue;
+
+    unsigned ID = SpecID->getZExtValue();
+    if (ID != UINT32_MAX)
+      continue;
+
+    // Replace placeholder Specialisation Constant IDs with unique IDs
+    // associated with the predicate being evaluated, which is encoded via
+    // spv_assign_name.
+    auto *MD =
+        cast<MDNode>(cast<MetadataAsValue>(CI->getOperand(2))->getMetadata());
+    auto *P = cast<MDString>(MD->getOperand(0));
+
+    ID = IDs.try_emplace(P->getString(), IDs.size()).first->second;
+    CI->setArgOperand(0, ConstantInt::get(CI->getArgOperand(0)->getType(), ID));
+  }
+
+  if (IDs.empty())
+    return false;
+
+  // Store the predicate -> ID mapping as a fixed format string
+  // (predicate ID\0...), for later use during SPIR-V consumption.
+  std::string Tmp;
+  for (auto &&[Predicate, SpecID] : IDs)
+    Tmp.append(Predicate).append(" ").append(utostr(SpecID)).push_back('\0');
+
+  Constant *PredSpecIDStr =
+      ConstantDataArray::getString(M.getContext(), Tmp, false);
+
+  new GlobalVariable(M, PredSpecIDStr->getType(), true,
+                     GlobalVariable::LinkageTypes::ExternalLinkage,
+                     PredSpecIDStr, "llvm.amdgcn.feature.predicate.ids");
+
+  return true;
+}
+
 bool SPIRVPrepareGlobals::runOnModule(Module &M) {
  bool Changed = false;

@ -78,6 +134,16 @@ bool SPIRVPrepareGlobals::runOnModule(Module &M) {
    Changed |= tryReplaceAliasWithAliasee(GA);
  }

+  if (M.getTargetTriple().getVendor() != Triple::AMD)
+    return Changed;
+
+  // TODO: Currently, for AMDGCN flavoured SPIR-V, the symbol can only be
+  //       inserted via feature predicate use, but in the future this will need
+  //       revisiting if we start making more liberal use of the intrinsic.
+  if (Function *F = Intrinsic::getDeclarationIfExists(
+          &M, Intrinsic::spv_named_boolean_spec_constant))
+    Changed |= tryAssignPredicateSpecConstIDs(M, F);
+
  return Changed;
 }
 char SPIRVPrepareGlobals::ID = 0;
--- a/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll
+++ b/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll
@ -0,0 +1,281 @@
+; RUN: llc -O0 --verify-machineinstrs -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpName %[[#KERNEL:]] "kernel"
+; CHECK: OpName %[[#FEATURE_PREDICATE_IDS:]] "llvm.amdgcn.feature.predicate.ids"
+; CHECK: OpName %[[#SET_FPENV_I64:]] "spirv.llvm_set_fpenv_i64"
+; CHECK: OpName %[[#ASHR_PK_I8_I32:]] "spirv.llvm_amdgcn_ashr_pk_i8_i32"
+; CHECK: OpName %[[#S_SLEEP_VAR:]] "spirv.llvm_amdgcn_s_sleep_var"
+; CHECK: OpName %[[#S_WAIT_EVENT_EXPORT_READY:]] "spirv.llvm_amdgcn_s_wait_event_export_ready"
+; CHECK: OpName %[[#S_TTRACEDATA_IMM:]] "spirv.llvm_amdgcn_s_ttracedata_imm"
+; CHECK: OpDecorate %[[#IS_GFX950:]] SpecId 8
+; CHECK: OpDecorate %[[#IS_GFX1201:]] SpecId 3
+; CHECK: OpDecorate %[[#HAS_GFX12_INSTS:]] SpecId 7
+; CHECK: OpDecorate %[[#IS_GFX906:]] SpecId 6
+; CHECK: OpDecorate %[[#IS_GFX1010:]] SpecId 4
+; CHECK: OpDecorate %[[#IS_GFX1101:]] SpecId 5
+; CHECK: OpDecorate %[[#IS_GFX1101_1:]] SpecId 5
+; CHECK: OpDecorate %[[#IS_GFX1010_1:]] SpecId 4
+; CHECK: OpDecorate %[[#IS_GFX1201_1:]] SpecId 3
+; CHECK: OpDecorate %[[#HAS_GFX11_INSTS:]] SpecId 0
+; CHECK: OpDecorate %[[#HAS_GFX10_INSTS:]] SpecId 2
+; CHECK: OpDecorate %[[#HAS_GFX1250_INSTS:]] SpecId 1
+; CHECK: OpDecorate %[[#HAS_GFX11_INSTS_1:]] SpecId 0
+; CHECK: %[[#BOOL:]] = OpTypeBool
+; CHECK: %[[#UCHAR:]] = OpTypeInt 8
+; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRLEN:]] = OpConstant %[[#]] 137
+; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRTY:]] = OpTypeArray %[[#UCHAR]] %[[#FEATURE_PREDICATE_IDS_MAP_STRLEN]]
+; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRVAL:]] = OpConstantComposite %[[#FEATURE_PREDICATE_IDS_MAP_STRTY]]
+; CHECK: %[[#FEATURE_PREDICATE_IDS]] = OpVariable %[[#]] CrossWorkgroup %[[#FEATURE_PREDICATE_IDS_MAP_STRVAL]]
+; CHECK: %[[#IS_GFX950]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1201]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX12_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX906]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1010]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1101]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1101_1]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1010_1]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1201_1]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX11_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX10_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX1250_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX11_INSTS_1]] = OpSpecConstantFalse %[[#BOOL]]
+
+declare void @llvm.amdgcn.s.monitor.sleep(i16 immarg) addrspace(4)
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg) addrspace(4)
+
+declare i1 @llvm.spv.named.boolean.spec.constant(i32, i1, metadata) addrspace(4)
+
+declare i16 @llvm.amdgcn.ashr.pk.i8.i32(i32, i32, i32) addrspace(4) #3
+
+declare void @llvm.set.fpenv.i64(i64) addrspace(4) #4
+
+declare void @llvm.amdgcn.s.sleep.var(i32) addrspace(4) #5
+
+declare void @llvm.amdgcn.s.wait.event.export.ready() addrspace(4) #5
+
+declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) addrspace(4) #6
+
+@p = external addrspace(1) global i32
+@g = external addrspace(1) constant i32
+
+define void @kernel() addrspace(4) {
+; CHECK-DAG: %[[#KERNEL]] = OpFunction %39 None %40 ; -- Begin function kernel
+; CHECK-NEXT: %2 = OpLabel
+; CHECK-NEXT: %112 = OpLoad %44 %85 Aligned 4
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX950]] %4 %3
+; CHECK-NEXT: %3 = OpLabel
+; CHECK-NEXT: %113 = OpFunctionCall %39 %[[#SET_FPENV_I64]] %57
+; CHECK-NEXT: OpBranch %5
+; CHECK-NEXT: %4 = OpLabel
+; CHECK-NEXT: %114 = OpFunctionCall %43 %[[#ASHR_PK_I8_I32]] %56 %56 %56
+; CHECK-NEXT: OpBranch %5
+; CHECK-NEXT: %5 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1201]] %7 %6
+; CHECK-NEXT: %6 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX12_INSTS]] %7 %8
+; CHECK-NEXT: %7 = OpLabel
+; CHECK-NEXT: %115 = OpFunctionCall %39 %[[#S_SLEEP_VAR]] %112
+; CHECK-NEXT: OpBranch %8
+; CHECK-NEXT: %8 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX906]] %10 %9
+; CHECK-NEXT: %9 = OpLabel
+; CHECK-NEXT: %116 = OpFunctionCall %39 %[[#S_WAIT_EVENT_EXPORT_READY]]
+; CHECK-NEXT: OpBranch %14
+; CHECK-NEXT: %10 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1010]] %12 %11
+; CHECK-NEXT: %11 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1101]] %12 %13
+; CHECK-NEXT: %12 = OpLabel
+; CHECK-NEXT: %117 = OpFunctionCall %39 %[[#S_TTRACEDATA_IMM]] %55
+; CHECK-NEXT: OpBranch %13
+; CHECK-NEXT: %13 = OpLabel
+; CHECK-NEXT: OpBranch %14
+; CHECK-NEXT: %14 = OpLabel
+; CHECK-NEXT: OpBranch %15
+; CHECK-NEXT: %15 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1101_1]] %16 %17
+; CHECK-NEXT: %16 = OpLabel
+; CHECK-NEXT: %118 = OpLoad %44 %97 Aligned 4
+; CHECK-NEXT: %119 = OpIAdd %44 %118 %112
+; CHECK-NEXT: OpStore %97 %119 Aligned 4
+; CHECK-NEXT: OpBranch %17
+; CHECK-NEXT: %17 = OpLabel
+; CHECK-NEXT: OpBranch %18
+; CHECK-NEXT: %18 = OpLabel
+; CHECK-NEXT: %120 = OpLoad %44 %97 Aligned 4
+; CHECK-NEXT: %121 = OpISub %44 %120 %112
+; CHECK-NEXT: OpStore %97 %121 Aligned 4
+; CHECK-NEXT: OpBranch %19
+; CHECK-NEXT: %19 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1010_1]] %20 %21
+; CHECK-NEXT: %20 = OpLabel
+; CHECK-NEXT: OpBranch %21
+; CHECK-NEXT: %21 = OpLabel
+; CHECK-NEXT: %122 = OpPhi %48 %54 %19 %54 %20
+; CHECK-NEXT: OpBranchConditional %122 %18 %22
+; CHECK-NEXT: %22 = OpLabel
+; CHECK-NEXT: OpBranch %23
+; CHECK-NEXT:	%23 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1201_1]] %24 %25
+; CHECK-NEXT: %24 = OpLabel
+; CHECK-NEXT: OpBranch %25
+; CHECK-NEXT: %25 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX11_INSTS]] %29 %26
+; CHECK-NEXT: %26 = OpLabel
+;	CHECK-NEXT: OpBranchConditional %[[#HAS_GFX10_INSTS]] %27 %28
+;	CHECK-NEXT: %27 = OpLabel
+; CHECK-NEXT: %123 = OpFunctionCall %39 %[[#S_TTRACEDATA_IMM]] %55
+; CHECK-NEXT: OpBranch %28
+; CHECK-NEXT:	%28 = OpLabel
+; CHECK-NEXT:	OpBranch %30
+; CHECK-NEXT: %29 = OpLabel
+; CHECK-NEXT: %124 = OpFunctionCall %39 %[[#S_WAIT_EVENT_EXPORT_READY]]
+; CHECK-NEXT: OpBranch %30
+; CHECK-NEXT: %30 = OpLabel
+; CHECK-NEXT: OpBranch %31
+; CHECK-NEXT: %31 = OpLabel
+; CHECK-NEXT: %125 = OpLoad %44 %97 Aligned 4
+; CHECK-NEXT: %126 = OpISub %44 %125 %112
+; CHECK-NEXT: OpStore %97 %126 Aligned 4
+; CHECK-NEXT: OpBranch %32
+; CHECK-NEXT: %32 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX1250_INSTS]] %33 %34
+; CHECK-NEXT: %33 = OpLabel
+; CHECK-NEXT: OpBranch %34
+; CHECK-NEXT: %34 = OpLabel
+; CHECK-NEXT: %127 = OpPhi %48 %54 %32 %54 %33
+; CHECK-NEXT:	OpBranchConditional %127 %31 %35
+; CHECK-NEXT: %35 = OpLabel
+; CHECK-NEXT: OpBranch %36
+; CHECK-NEXT: %36 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX11_INSTS_1]] %37 %38
+
+entry:
+  %x = load i32, ptr addrspace(1) @g
+  %is.gfx950. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !9)
+  br i1 %is.gfx950., label %cond.true, label %cond.false
+cond.true:
+  %0 = call addrspace(4) i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 8, i32 8, i32 8)
+  br label %cond.end
+cond.false:
+  call addrspace(4) void @llvm.set.fpenv.i64(i64 -1)
+  br label %cond.end
+cond.end:
+  %is.gfx1201. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !10)
+  br i1 %is.gfx1201., label %if.then, label %lor.lhs.false
+lor.lhs.false:
+  %has.gfx12-insts. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !11)
+  br i1 %has.gfx12-insts., label %if.then, label %if.end
+if.then:
+  call addrspace(4) void @llvm.amdgcn.s.sleep.var(i32 %x)
+  br label %if.end
+if.end:
+  %is.gfx906. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !12)
+  br i1 %is.gfx906., label %if.else, label %if.then2
+if.then2:
+  call addrspace(4) void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end6
+if.else:
+  %is.gfx1010. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !13)
+  br i1 %is.gfx1010., label %if.then4, label %lor.lhs.false3
+lor.lhs.false3:
+  %is.gfx1101. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !14)
+  br i1 %is.gfx1101., label %if.then4, label %if.end5
+if.then4:
+  call addrspace(4) void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end5
+if.end5:
+  br label %if.end6
+if.end6:
+  br label %while.cond
+while.cond:
+  %is.gfx1101.7 = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !14)
+  br i1 %is.gfx1101.7, label %while.body, label %while.end
+while.body:
+  %4 = load i32, ptr addrspace(1) @p
+  %add = add i32 %4, %x
+  store i32 %add, ptr addrspace(1) @p
+  br label %while.end
+while.end:
+  br label %do.body
+do.body:
+  %7 = load i32, ptr addrspace(1) @p
+  %sub = sub i32 %7, %x
+  store i32 %sub, ptr addrspace(1) @p
+  br label %do.cond
+do.cond:
+  %is.gfx1010.8 = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !13)
+  br i1 %is.gfx1010.8, label %land.rhs, label %land.end
+land.rhs:
+  br label %land.end
+land.end:
+  %c = phi i1 [ false, %do.cond ], [ false, %land.rhs ]
+  br i1 %c, label %do.body, label %do.end
+do.end:
+  br label %for.cond
+for.cond:
+  %is.gfx1201.9 = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !10)
+  br i1 %is.gfx1201.9, label %for.body, label %for.end
+for.body:
+  br label %for.end
+for.inc:
+  %9 = load i32, ptr addrspace(1) @p
+  %inc = add i32 %9, 1
+  store i32 %inc, ptr addrspace(1) @p
+  br label %for.cond
+for.end:
+  %has.gfx11-insts. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !18)
+  br i1 %has.gfx11-insts., label %if.then10, label %if.else11
+if.then10:
+  call addrspace(4) void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end14
+if.else11:
+  %has.gfx10-insts. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !19)
+  br i1 %has.gfx10-insts., label %if.then12, label %if.end13
+if.then12:
+  call addrspace(4) void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end13
+if.end13:
+  br label %if.end14
+if.end14:
+  br label %do.body15
+do.body15:
+  %12 = load i32, ptr addrspace(1) @p
+  %sub16 = sub i32 %12, %x
+  store i32 %sub16, ptr addrspace(1) @p
+  br label %do.cond17
+do.cond17:
+  %has.gfx1250-insts. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !20)
+  br i1 %has.gfx1250-insts., label %land.rhs9, label %land.end10
+land.rhs9:
+  br label %land.end10
+land.end10:
+  %c1 = phi i1 [ false, %do.cond17 ], [ false, %land.rhs9 ]
+  br i1 %c1, label %do.body15, label %do.end18
+do.end18:
+  br label %for.cond19
+for.cond19:
+  %has.gfx11-insts.20 = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !18)
+  br i1 %has.gfx11-insts.20, label %for.body21, label %for.end24
+for.body21:
+  br label %for.end24
+for.inc22:
+  %14 = load i32, ptr addrspace(1) @p
+  %inc23 = add i32 %14, 1
+  store i32 %inc23, ptr addrspace(1) @p
+  br label %for.cond19
+for.end24:
+  ret void
+}
+
+!9 = !{!"is.gfx950"}
+!10 = !{!"is.gfx1201"}
+!11 = !{!"has.gfx12-insts"}
+!12 = !{!"is.gfx906"}
+!13 = !{!"is.gfx1010"}
+!14 = !{!"is.gfx1101"}
+!18 = !{!"has.gfx11-insts"}
+!19 = !{!"has.gfx10-insts"}
+!20 = !{!"has.gfx1250-insts"}
--- a/llvm/test/CodeGen/SPIRV/passes/SPIRVPrepareGlobals-predicate-id-string.ll
+++ b/llvm/test/CodeGen/SPIRV/passes/SPIRVPrepareGlobals-predicate-id-string.ll
@ -0,0 +1,153 @@
+; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -print-after-all -o - 2>&1 | FileCheck %s
+; RUN: llc -O3 -mtriple=spirv64-amd-amdhsa %s -print-after-all -o - 2>&1 | FileCheck %s
+
+; CHECK: *** IR Dump After SPIRV prepare global variables (prepare-globals) ***
+
+declare void @llvm.amdgcn.s.monitor.sleep(i16 immarg) addrspace(4)
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg) addrspace(4)
+
+declare i1 @_Z20__spirv_SpecConstantib(i32, i1) addrspace(4)
+
+declare i16 @llvm.amdgcn.ashr.pk.i8.i32(i32, i32, i32) addrspace(4) #3
+
+declare void @llvm.set.fpenv.i64(i64) addrspace(4) #4
+
+declare void @llvm.amdgcn.s.sleep.var(i32) addrspace(4) #5
+
+declare void @llvm.amdgcn.s.wait.event.export.ready() addrspace(4) #5
+
+declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) addrspace(4) #6
+
+@p = external addrspace(1) global i32
+@g = external addrspace(1) constant i32
+; CHECK: @llvm.amdgcn.feature.predicate.ids = addrspace(1) constant [137 x i8] c"is.gfx1010 4\00is.gfx950 0\00is.gfx1101 5\00has.gfx11-insts 6\00is.gfx906 3\00is.gfx1201 1\00has.gfx1250-insts 8\00has.gfx10-insts 7\00has.gfx12-insts 2\00"
+
+define void @kernel() addrspace(4) {
+entry:
+  %x = load i32, ptr addrspace(1) @g
+  %is.gfx950. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !9)
+  br i1 %is.gfx950., label %cond.true, label %cond.false
+cond.true:
+  %0 = call addrspace(4) i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 8, i32 8, i32 8)
+  br label %cond.end
+cond.false:
+  call addrspace(4) void @llvm.set.fpenv.i64(i64 -1)
+  br label %cond.end
+cond.end:
+  %is.gfx1201. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !10)
+  br i1 %is.gfx1201., label %if.then, label %lor.lhs.false
+lor.lhs.false:
+  %has.gfx12-insts. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !11)
+  br i1 %has.gfx12-insts., label %if.then, label %if.end
+if.then:
+  call addrspace(4) void @llvm.amdgcn.s.sleep.var(i32 %x)
+  br label %if.end
+if.end:
+  %is.gfx906. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !12)
+  br i1 %is.gfx906., label %if.else, label %if.then2
+if.then2:
+  call addrspace(4) void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end6
+if.else:
+  %is.gfx1010. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !13)
+  br i1 %is.gfx1010., label %if.then4, label %lor.lhs.false3
+lor.lhs.false3:
+  %is.gfx1101. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !14)
+  br i1 %is.gfx1101., label %if.then4, label %if.end5
+if.then4:
+  call addrspace(4) void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end5
+if.end5:
+  br label %if.end6
+if.end6:
+  br label %while.cond
+while.cond:
+  %is.gfx1101.7 = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !14)
+  br i1 %is.gfx1101.7, label %while.body, label %while.end
+while.body:
+  %4 = load i32, ptr addrspace(1) @p
+  %add = add i32 %4, %x
+  store i32 %add, ptr addrspace(1) @p
+  br label %while.end
+while.end:
+  br label %do.body
+do.body:
+  %7 = load i32, ptr addrspace(1) @p
+  %sub = sub i32 %7, %x
+  store i32 %sub, ptr addrspace(1) @p
+  br label %do.cond
+do.cond:
+  %is.gfx1010.8 = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !13)
+  br i1 %is.gfx1010.8, label %land.rhs, label %land.end
+land.rhs:
+  br label %land.end
+land.end:
+  %c = phi i1 [ false, %do.cond ], [ false, %land.rhs ]
+  br i1 %c, label %do.body, label %do.end
+do.end:
+  br label %for.cond
+for.cond:
+  %is.gfx1201.9 = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !10)
+  br i1 %is.gfx1201.9, label %for.body, label %for.end
+for.body:
+  br label %for.end
+for.inc:
+  %9 = load i32, ptr addrspace(1) @p
+  %inc = add i32 %9, 1
+  store i32 %inc, ptr addrspace(1) @p
+  br label %for.cond
+for.end:
+  %has.gfx11-insts. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !18)
+  br i1 %has.gfx11-insts., label %if.then10, label %if.else11
+if.then10:
+  call addrspace(4) void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end14
+if.else11:
+  %has.gfx10-insts. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !19)
+  br i1 %has.gfx10-insts., label %if.then12, label %if.end13
+if.then12:
+  call addrspace(4) void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end13
+if.end13:
+  br label %if.end14
+if.end14:
+  br label %do.body15
+do.body15:
+  %12 = load i32, ptr addrspace(1) @p
+  %sub16 = sub i32 %12, %x
+  store i32 %sub16, ptr addrspace(1) @p
+  br label %do.cond17
+do.cond17:
+  %has.gfx1250-insts. = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !20)
+  br i1 %has.gfx1250-insts., label %land.rhs9, label %land.end10
+land.rhs9:
+  br label %land.end10
+land.end10:
+  %c1 = phi i1 [ false, %do.cond17 ], [ false, %land.rhs9 ]
+  br i1 %c1, label %do.body15, label %do.end18
+do.end18:
+  br label %for.cond19
+for.cond19:
+  %has.gfx11-insts.20 = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata !18)
+  br i1 %has.gfx11-insts.20, label %for.body21, label %for.end24
+for.body21:
+  br label %for.end24
+for.inc22:
+  %14 = load i32, ptr addrspace(1) @p
+  %inc23 = add i32 %14, 1
+  store i32 %inc23, ptr addrspace(1) @p
+  br label %for.cond19
+for.end24:
+  ret void
+}
+
+!9 = !{!"is.gfx950"}
+!10 = !{!"is.gfx1201"}
+!11 = !{!"has.gfx12-insts"}
+!12 = !{!"is.gfx906"}
+!13 = !{!"is.gfx1010"}
+!14 = !{!"is.gfx1101"}
+!18 = !{!"has.gfx11-insts"}
+!19 = !{!"has.gfx10-insts"}
+!20 = !{!"has.gfx1250-insts"}