[Support/BLAKE3] Make g_cpu_features thread safe (#147948)

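`g_cpu_features` can be written more than once by concurrent calls to `get_cpu_features()`, which ThreadSanitizer reports as a data race when lld runs with multiple threads. This PR updates BLAKE3 to v1.8.2, which reads and writes `g_cpu_features` through the `ATOMIC_LOAD`/`ATOMIC_STORE` macros.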
2025-07-12 11:02:56 +04:00 · 2025-07-12 11:02:56 +04:00 · d2ad63a193
commit d2ad63a193
parent f6c927e8db
15 changed files with 2661 additions and 91 deletions
--- a/llvm/include/llvm-c/blake3.h
+++ b/llvm/include/llvm-c/blake3.h
@ -25,7 +25,7 @@
 extern "C" {
 #endif

-#define LLVM_BLAKE3_VERSION_STRING "1.3.1"
+#define LLVM_BLAKE3_VERSION_STRING "1.8.2"
 #define LLVM_BLAKE3_KEY_LEN 32
 #define LLVM_BLAKE3_OUT_LEN 32
 #define LLVM_BLAKE3_BLOCK_LEN 64
--- a/llvm/lib/Support/BLAKE3/README.md
+++ b/llvm/lib/Support/BLAKE3/README.md
@ -1,4 +1,4 @@
-Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c
+Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.8.2/c

 # Example

--- a/llvm/lib/Support/BLAKE3/blake3.c
+++ b/llvm/lib/Support/BLAKE3/blake3.c
@ -95,24 +95,30 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {

 INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
                              size_t out_len) {
+  if (out_len == 0) {
+      return;
+  }
  uint64_t output_block_counter = seek / 64;
  size_t offset_within_block = seek % 64;
  uint8_t wide_buf[64];
-  while (out_len > 0) {
-    blake3_compress_xof(self->input_cv, self->block, self->block_len,
-                        output_block_counter, self->flags | ROOT, wide_buf);
-    size_t available_bytes = 64 - offset_within_block;
-    size_t memcpy_len;
-    if (out_len > available_bytes) {
-      memcpy_len = available_bytes;
-    } else {
-      memcpy_len = out_len;
-    }
-    memcpy(out, wide_buf + offset_within_block, memcpy_len);
-    out += memcpy_len;
-    out_len -= memcpy_len;
+  if(offset_within_block) {
+    blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
+    const size_t available_bytes = 64 - offset_within_block;
+    const size_t bytes = out_len > available_bytes ? available_bytes : out_len;
+    memcpy(out, wide_buf + offset_within_block, bytes);
+    out += bytes;
+    out_len -= bytes;
    output_block_counter += 1;
-    offset_within_block = 0;
+  }
+  if(out_len / 64) {
+    blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64);
+  }
+  output_block_counter += out_len / 64;
+  out += out_len & -64;
+  out_len -= out_len & -64;
+  if(out_len) {
+    blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
+    memcpy(out, wide_buf, out_len);
  }
 }

@ -159,10 +165,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
 // Given some input larger than one chunk, return the number of bytes that
 // should go in the left subtree. This is the largest power-of-2 number of
 // chunks that leaves at least 1 byte for the right subtree.
-INLINE size_t left_len(size_t content_len) {
-  // Subtract 1 to reserve at least one byte for the right side. content_len
+INLINE size_t left_subtree_len(size_t input_len) {
+  // Subtract 1 to reserve at least one byte for the right side. input_len
  // should always be greater than BLAKE3_CHUNK_LEN.
-  size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
+  size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
  return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
 }

@ -251,7 +257,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,

 // The wide helper function returns (writes out) an array of chaining values
 // and returns the length of that array. The number of chaining values returned
-// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
 // if the input is shorter than that many chunks. The reason for maintaining a
 // wide array of chaining values going back up the tree, is to allow the
 // implementation to hash as many parents in parallel as possible.
@ -259,18 +265,17 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
 // As a special case when the SIMD degree is 1, this function will still return
 // at least 2 outputs. This guarantees that this function doesn't perform the
 // root compression. (If it did, it would use the wrong flags, and also we
-// wouldn't be able to implement exendable ouput.) Note that this function is
+// wouldn't be able to implement extendable output.) Note that this function is
 // not used when the whole input is only 1 chunk long; that's a different
 // codepath.
 //
 // Why not just have the caller split the input on the first update(), instead
 // of implementing this special rule? Because we don't want to limit SIMD or
 // multi-threading parallelism for that update().
-static size_t blake3_compress_subtree_wide(const uint8_t *input,
-                                           size_t input_len,
-                                           const uint32_t key[8],
-                                           uint64_t chunk_counter,
-                                           uint8_t flags, uint8_t *out) {
+size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
+                                    const uint32_t key[8],
+                                    uint64_t chunk_counter, uint8_t flags,
+                                    uint8_t *out, bool use_tbb) {
  // Note that the single chunk case does *not* bump the SIMD degree up to 2
  // when it is 1. If this implementation adds multi-threading in the future,
  // this gives us the option of multi-threading even the 2-chunk case, which
@ -284,7 +289,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
  // the input into left and right subtrees. (Note that this is only optimal
  // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
  // of 3 or something, we'll need a more complicated strategy.)
-  size_t left_input_len = left_len(input_len);
+  size_t left_input_len = left_subtree_len(input_len);
  size_t right_input_len = input_len - left_input_len;
  const uint8_t *right_input = &input[left_input_len];
  uint64_t right_chunk_counter =
@ -304,12 +309,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
  }
  uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

-  // Recurse! If this implementation adds multi-threading support in the
-  // future, this is where it will go.
-  size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
-                                               chunk_counter, flags, cv_array);
-  size_t right_n = blake3_compress_subtree_wide(
-      right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
+  // Recurse!
+  size_t left_n = -1;
+  size_t right_n = -1;
+
+#if defined(BLAKE3_USE_TBB)
+  blake3_compress_subtree_wide_join_tbb(
+      key, flags, use_tbb,
+      // left-hand side
+      input, left_input_len, chunk_counter, cv_array, &left_n,
+      // right-hand side
+      right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
+#else
+  left_n = blake3_compress_subtree_wide(
+      input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
+  right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
+                                         right_chunk_counter, flags, right_cvs,
+                                         use_tbb);
+#endif // BLAKE3_USE_TBB

  // The special case again. If simd_degree=1, then we'll have left_n=1 and
  // right_n=1. Rather than compressing them into a single output, return
@ -335,32 +352,37 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
 //
 // As with compress_subtree_wide(), this function is not used on inputs of 1
 // chunk or less. That's a different codepath.
-INLINE void compress_subtree_to_parent_node(
-    const uint8_t *input, size_t input_len, const uint32_t key[8],
-    uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
+INLINE void
+compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
+                                const uint32_t key[8], uint64_t chunk_counter,
+                                uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
+                                bool use_tbb) {
 #if defined(BLAKE3_TESTING)
  assert(input_len > BLAKE3_CHUNK_LEN);
 #endif

  uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
-                                                chunk_counter, flags, cv_array);
+                                                chunk_counter, flags, cv_array, use_tbb);
  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
-
-  // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
+  // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
+  // as we just asserted, num_cvs will always be <=2 in that case. But GCC
+  // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
+  // set then it emits incorrect warnings here. We tried a few different
+  // hacks to silence these, but in the end our hacks just produced different
+  // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
+  // desperation, we ifdef out this entire loop when we know it's not needed.
+#if MAX_SIMD_DEGREE_OR_2 > 2
+  // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
  // compress_subtree_wide() returns more than 2 chaining values. Condense
  // them into 2 by forming parent nodes repeatedly.
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
-  // The second half of this loop condition is always true, and we just
-  // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
-  // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
-  // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
-  // this code, test it against that version.
-  while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
+  while (num_cvs > 2) {
    num_cvs =
        compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
    memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
  }
+#endif
  memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
 }

@ -432,7 +454,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
 //    of the whole tree, and it would need to be ROOT finalized. We can't
 //    compress it until we know.
 // 2) This 64 KiB input might complete a larger tree, whose root node is
-//    similarly going to be the the root of the whole tree. For example, maybe
+//    similarly going to be the root of the whole tree. For example, maybe
 //    we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the
 //    node at the root of the 256 KiB subtree until we know how to finalize it.
 //
@ -457,8 +479,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
  self->cv_stack_len += 1;
 }

-void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
-                          size_t input_len) {
+INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
+                                      size_t input_len, bool use_tbb) {
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
@ -544,7 +566,7 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
      uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
      compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
                                      self->chunk.chunk_counter,
-                                      self->chunk.flags, cv_pair);
+                                      self->chunk.flags, cv_pair, use_tbb);
      hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
      hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
                     self->chunk.chunk_counter + (subtree_chunks / 2));
@ -566,6 +588,20 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
  }
 }

+void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
+                          size_t input_len) {
+  bool use_tbb = false;
+  blake3_hasher_update_base(self, input, input_len, use_tbb);
+}
+
+#if defined(BLAKE3_USE_TBB)
+void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
+                              size_t input_len) {
+  bool use_tbb = true;
+  blake3_hasher_update_base(self, input, input_len, use_tbb);
+}
+#endif // BLAKE3_USE_TBB
+
 void llvm_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
                            size_t out_len) {
  llvm_blake3_hasher_finalize_seek(self, 0, out, out_len);
--- a/llvm/lib/Support/BLAKE3/blake3_avx2.c
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2.c
@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

-  // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
+  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
  // 11/33.
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
--- a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
@ -1786,7 +1786,7 @@ blake3_hash_many_avx2:
        vmovdqu xmmword ptr [rbx+0x10], xmm1
        jmp     4b

-.section .rodata
+.section .rdata
 .p2align  6
 ADD0:
        .long  0, 1, 2, 3, 4, 5, 6, 7
--- a/llvm/lib/Support/BLAKE3/blake3_avx512.c
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512.c
@ -22,10 +22,14 @@ INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
  _mm_storeu_si128((__m128i *)dest, src);
 }

-INLINE void storeu_256(__m256i src, uint8_t dest[16]) {
+INLINE void storeu_256(__m256i src, uint8_t dest[32]) {
  _mm256_storeu_si256((__m256i *)dest, src);
 }

+INLINE void storeu_512(__m512i src, uint8_t dest[64]) {
+  _mm512_storeu_si512((__m512i *)dest, src);
+}
+
 INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

 INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
@ -429,7 +433,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
 }

 INLINE void transpose_vecs_128(__m128i vecs[4]) {
-  // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@ -550,6 +554,54 @@ void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
  storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
 }

+static
+void blake3_xof4_avx512(const uint32_t cv[8],
+                        const uint8_t block[BLAKE3_BLOCK_LEN],
+                        uint8_t block_len, uint64_t counter, uint8_t flags,
+                        uint8_t out[4 * 64]) {
+  __m128i h_vecs[8] = {
+      set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
+      set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
+  };
+  uint32_t block_words[16];
+  load_block_words(block, block_words);
+  __m128i msg_vecs[16];
+  for (size_t i = 0; i < 16; i++) {
+      msg_vecs[i] = set1_128(block_words[i]);
+  }
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
+  __m128i block_len_vec = set1_128(block_len);
+  __m128i block_flags_vec = set1_128(flags);
+  __m128i v[16] = {
+      h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+      h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+      set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
+      counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+  };
+  round_fn4(v, msg_vecs, 0);
+  round_fn4(v, msg_vecs, 1);
+  round_fn4(v, msg_vecs, 2);
+  round_fn4(v, msg_vecs, 3);
+  round_fn4(v, msg_vecs, 4);
+  round_fn4(v, msg_vecs, 5);
+  round_fn4(v, msg_vecs, 6);
+  for (size_t i = 0; i < 8; i++) {
+      v[i] = xor_128(v[i], v[i+8]);
+      v[i+8] = xor_128(v[i+8], h_vecs[i]);
+  }
+  transpose_vecs_128(&v[0]);
+  transpose_vecs_128(&v[4]);
+  transpose_vecs_128(&v[8]);
+  transpose_vecs_128(&v[12]);
+  for (size_t i = 0; i < 4; i++) {
+      storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
+      storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
+      storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
+      storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
+  }
+}
+
 /*
 * ----------------------------------------------------------------------------
 * hash8_avx512
@ -684,7 +736,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

-  // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
+  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
  // 11/33.
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@ -802,6 +854,50 @@ void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
  storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
 }

+static
+void blake3_xof8_avx512(const uint32_t cv[8],
+                        const uint8_t block[BLAKE3_BLOCK_LEN],
+                        uint8_t block_len, uint64_t counter, uint8_t flags,
+                        uint8_t out[8 * 64]) {
+  __m256i h_vecs[8] = {
+      set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]),
+      set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]),
+  };
+  uint32_t block_words[16];
+  load_block_words(block, block_words);
+  __m256i msg_vecs[16];
+  for (size_t i = 0; i < 16; i++) {
+      msg_vecs[i] = set1_256(block_words[i]);
+  }
+  __m256i counter_low_vec, counter_high_vec;
+  load_counters8(counter, true, &counter_low_vec, &counter_high_vec);
+  __m256i block_len_vec = set1_256(block_len);
+  __m256i block_flags_vec = set1_256(flags);
+  __m256i v[16] = {
+      h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+      h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+      set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
+      counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+  };
+  round_fn8(v, msg_vecs, 0);
+  round_fn8(v, msg_vecs, 1);
+  round_fn8(v, msg_vecs, 2);
+  round_fn8(v, msg_vecs, 3);
+  round_fn8(v, msg_vecs, 4);
+  round_fn8(v, msg_vecs, 5);
+  round_fn8(v, msg_vecs, 6);
+  for (size_t i = 0; i < 8; i++) {
+      v[i] = xor_256(v[i], v[i+8]);
+      v[i+8] = xor_256(v[i+8], h_vecs[i]);
+  }
+  transpose_vecs_256(&v[0]);
+  transpose_vecs_256(&v[8]);
+  for (size_t i = 0; i < 8; i++) {
+      storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]);
+      storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]);
+  }
+}
+
 /*
 * ----------------------------------------------------------------------------
 * hash16_avx512
@ -959,7 +1055,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);

-  // Interleave 64-bit lates. The _0 unpack is lanes
+  // Interleave 64-bit lanes. The _0 unpack is lanes
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
@ -1047,13 +1143,26 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
 INLINE void load_counters16(uint64_t counter, bool increment_counter,
                            __m512i *out_lo, __m512i *out_hi) {
  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
-  const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  const __m512i add1 = _mm512_and_si512(mask, add0);
-  __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1);
-  __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
-  __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carry, _mm512_set1_epi32((int32_t)(counter >> 32)), _mm512_set1_epi32(1));
-  *out_lo = l;
-  *out_hi = h;
+  const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
+  const __m512i low_words = _mm512_add_epi32(
+    _mm512_set1_epi32((int32_t)counter),
+    masked_deltas);
+  // The carry bit is 1 if the high bit of the word was 1 before addition and is
+  // 0 after.
+  // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
+  // compute the carry bits here, and originally we did, but that intrinsic is
+  // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
+  const __m512i carries = _mm512_srli_epi32(
+    _mm512_andnot_si512(
+        low_words, // 0 after (gets inverted by andnot)
+        _mm512_set1_epi32((int32_t)counter)), // and 1 before
+    31);
+  const __m512i high_words = _mm512_add_epi32(
+    _mm512_set1_epi32((int32_t)(counter >> 32)),
+    carries);
+  *out_lo = low_words;
+  *out_hi = high_words;
 }

 static
@ -1133,6 +1242,48 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
  _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
 }

+static
+void blake3_xof16_avx512(const uint32_t cv[8],
+                        const uint8_t block[BLAKE3_BLOCK_LEN],
+                        uint8_t block_len, uint64_t counter, uint8_t flags,
+                        uint8_t out[16 * 64]) {
+  __m512i h_vecs[8] = {
+      set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
+      set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
+  };
+  uint32_t block_words[16];
+  load_block_words(block, block_words);
+  __m512i msg_vecs[16];
+  for (size_t i = 0; i < 16; i++) {
+      msg_vecs[i] = set1_512(block_words[i]);
+  }
+  __m512i counter_low_vec, counter_high_vec;
+  load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
+  __m512i block_len_vec = set1_512(block_len);
+  __m512i block_flags_vec = set1_512(flags);
+  __m512i v[16] = {
+      h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+      h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+      set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
+      counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+  };
+  round_fn16(v, msg_vecs, 0);
+  round_fn16(v, msg_vecs, 1);
+  round_fn16(v, msg_vecs, 2);
+  round_fn16(v, msg_vecs, 3);
+  round_fn16(v, msg_vecs, 4);
+  round_fn16(v, msg_vecs, 5);
+  round_fn16(v, msg_vecs, 6);
+  for (size_t i = 0; i < 8; i++) {
+      v[i] = xor_512(v[i], v[i+8]);
+      v[i+8] = xor_512(v[i+8], h_vecs[i]);
+  }
+  transpose_vecs_512(&v[0]);
+  for (size_t i = 0; i < 16; i++) {
+      storeu_512(v[i], &out[i * sizeof(__m512i)]);
+  }
+}
+
 /*
 * ----------------------------------------------------------------------------
 * hash_many_avx512
@ -1205,3 +1356,33 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
    out = &out[BLAKE3_OUT_LEN];
  }
 }
+
+void blake3_xof_many_avx512(const uint32_t cv[8],
+                            const uint8_t block[BLAKE3_BLOCK_LEN],
+                            uint8_t block_len, uint64_t counter, uint8_t flags,
+                            uint8_t* out, size_t outblocks) {
+  while (outblocks >= 16) {
+    blake3_xof16_avx512(cv, block, block_len, counter, flags, out);
+    counter += 16;
+    outblocks -= 16;
+    out += 16 * BLAKE3_BLOCK_LEN;
+  }
+  while (outblocks >= 8) {
+    blake3_xof8_avx512(cv, block, block_len, counter, flags, out);
+    counter += 8;
+    outblocks -= 8;
+    out += 8 * BLAKE3_BLOCK_LEN;
+  }
+  while (outblocks >= 4) {
+    blake3_xof4_avx512(cv, block, block_len, counter, flags, out);
+    counter += 4;
+    outblocks -= 4;
+    out += 4 * BLAKE3_BLOCK_LEN;
+  }
+  while (outblocks > 0) {
+    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+    counter += 1;
+    outblocks -= 1;
+    out += BLAKE3_BLOCK_LEN;
+  }
+}
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
@ -2589,7 +2589,7 @@ blake3_compress_xof_avx512:
        add     rsp, 72
        ret

-.section .rodata
+.section .rdata
 .p2align  6
 INDEX0:
        .long    0,  1,  2,  3, 16, 17, 18, 19
--- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c
+++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
@ -4,16 +4,46 @@

 #include "blake3_impl.h"

+#if defined(_MSC_VER)
+#include <Windows.h>
+#endif
+
 #if defined(IS_X86)
 #if defined(_MSC_VER)
 #include <intrin.h>
 #elif defined(__GNUC__)
 #include <immintrin.h>
 #else
-#error "Unimplemented!"
+#undef IS_X86 /* Unimplemented! */
 #endif
 #endif

+#if !defined(BLAKE3_ATOMICS)
+#if defined(__has_include)
+#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
+#define BLAKE3_ATOMICS 1
+#else
+#define BLAKE3_ATOMICS 0
+#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
+#else
+#define BLAKE3_ATOMICS 0
+#endif /* defined(__has_include) */
+#endif /* BLAKE3_ATOMICS */
+
+#if BLAKE3_ATOMICS
+#define ATOMIC_INT _Atomic int
+#define ATOMIC_LOAD(x) x
+#define ATOMIC_STORE(x, y) x = y
+#elif defined(_MSC_VER)
+#define ATOMIC_INT LONG
+#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
+#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
+#else
+#define ATOMIC_INT int
+#define ATOMIC_LOAD(x) x
+#define ATOMIC_STORE(x, y) x = y
+#endif
+
 #define MAYBE_UNUSED(x) (void)((x))

 #if defined(IS_X86)
@ -76,7 +106,7 @@ enum cpu_feature {
 #if !defined(BLAKE3_TESTING)
 static /* Allow the variable to be controlled manually for testing */
 #endif
-    enum cpu_feature g_cpu_features = UNDEFINED;
+    ATOMIC_INT g_cpu_features = UNDEFINED;

 LLVM_ATTRIBUTE_USED
 #if !defined(BLAKE3_TESTING)
@ -85,14 +115,16 @@ static
    enum cpu_feature
    get_cpu_features(void) {

-  if (g_cpu_features != UNDEFINED) {
-    return g_cpu_features;
+  /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
+  enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
+  if (features != UNDEFINED) {
+    return features;
  } else {
 #if defined(IS_X86)
    uint32_t regs[4] = {0};
    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
    (void)edx;
-    enum cpu_feature features = 0;
+    features = 0;
    cpuid(regs, 0);
    const int max_id = *eax;
    cpuid(regs, 1);
@ -102,7 +134,7 @@ static
    if (*edx & (1UL << 26))
      features |= SSE2;
 #endif
-    if (*ecx & (1UL << 0))
+    if (*ecx & (1UL << 9))
      features |= SSSE3;
    if (*ecx & (1UL << 19))
      features |= SSE41;
@ -125,7 +157,7 @@ static
        }
      }
    }
-    g_cpu_features = features;
+    ATOMIC_STORE(g_cpu_features, features);
    return features;
 #else
    /* How to detect NEON? */
@ -192,6 +224,30 @@ void blake3_compress_xof(const uint32_t cv[8],
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
 }

+
+void blake3_xof_many(const uint32_t cv[8],
+                     const uint8_t block[BLAKE3_BLOCK_LEN],
+                     uint8_t block_len, uint64_t counter, uint8_t flags,
+                     uint8_t out[64], size_t outblocks) {
+  if (outblocks == 0) {
+    // The current assembly implementation always outputs at least 1 block.
+    return;
+  }
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
+#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
+  if (features & AVX512VL) {
+    blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
+    return;
+  }
+#endif
+#endif
+  for(size_t i = 0; i < outblocks; ++i) {
+    blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
+  }
+}
+
 void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
--- a/llvm/lib/Support/BLAKE3/blake3_impl.h
+++ b/llvm/lib/Support/BLAKE3/blake3_impl.h
@ -13,6 +13,8 @@

 #include "llvm_blake3_prefix.h"

+#define BLAKE3_PRIVATE
+
 // internal flags
 enum blake3_flags {
  CHUNK_START         = 1 << 0,
@ -32,7 +34,7 @@ enum blake3_flags {
 #define INLINE static inline __attribute__((always_inline))
 #endif

-#if defined(__x86_64__) || defined(_M_X64) 
+#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
 #define IS_X86
 #define IS_X86_64
 #endif
@ -42,7 +44,7 @@ enum blake3_flags {
 #define IS_X86_32
 #endif

-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #define IS_AARCH64
 #endif

@ -54,10 +56,13 @@ enum blake3_flags {
 #endif

 #if !defined(BLAKE3_USE_NEON) 
-  // If BLAKE3_USE_NEON not manually set, autodetect based on
-  // AArch64ness and endianness.
-  #if defined(IS_AARCH64) && !defined(__ARM_BIG_ENDIAN)
-    #define BLAKE3_USE_NEON 1
+  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
+  #if defined(IS_AARCH64)
+    #if defined(__ARM_BIG_ENDIAN)
+      #define BLAKE3_USE_NEON 0
+    #else
+      #define BLAKE3_USE_NEON 1
+    #endif
  #else
    #define BLAKE3_USE_NEON 0
  #endif
@ -93,7 +98,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
 /* x is assumed to be nonzero.       */
 static unsigned int highest_one(uint64_t x) {
 #if defined(__GNUC__) || defined(__clang__)
-  return 63 ^ __builtin_clzll(x);
+  return 63 ^ (unsigned int)__builtin_clzll(x);
 #elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
@ -123,7 +128,7 @@ static unsigned int highest_one(uint64_t x) {
 // Count the number of 1 bits.
 INLINE unsigned int popcnt(uint64_t x) {
 #if defined(__GNUC__) || defined(__clang__)
-  return __builtin_popcountll(x);
+  return (unsigned int)__builtin_popcountll(x);
 #else
  unsigned int count = 0;
  while (x != 0) {
@ -164,6 +169,13 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
  key_words[7] = load32(&key[7 * 4]);
 }

+INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN],
+                             uint32_t block_words[16]) {
+  for (size_t i = 0; i < 16; i++) {
+      block_words[i] = load32(&block[i * 4]);
+  }
+}
+
 INLINE void store32(void *dst, uint32_t w) {
  uint8_t *p = (uint8_t *)dst;
  p[0] = (uint8_t)(w >> 0);
@ -195,6 +207,12 @@ void blake3_compress_xof(const uint32_t cv[8],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]);

+LLVM_LIBRARY_VISIBILITY
+void blake3_xof_many(const uint32_t cv[8],
+                     const uint8_t block[BLAKE3_BLOCK_LEN],
+                     uint8_t block_len, uint64_t counter, uint8_t flags,
+                     uint8_t out[64], size_t outblocks);
+
 LLVM_LIBRARY_VISIBILITY
 void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
@ -204,6 +222,22 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
 LLVM_LIBRARY_VISIBILITY
 size_t blake3_simd_degree(void);

+BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
+                                                   const uint32_t key[8],
+                                                   uint64_t chunk_counter, uint8_t flags,
+                                                   uint8_t *out, bool use_tbb);
+
+#if defined(BLAKE3_USE_TBB)
+BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb(
+    // shared params
+    const uint32_t key[8], uint8_t flags, bool use_tbb,
+    // left-hand side params
+    const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
+    uint8_t *l_cvs, size_t *l_n,
+    // right-hand side params
+    const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
+    uint8_t *r_cvs, size_t *r_n) NOEXCEPT;
+#endif

 // Declarations for implementation-specific functions.
 LLVM_LIBRARY_VISIBILITY
@ -289,6 +323,14 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out);
+
+#if !defined(_WIN32)
+LLVM_LIBRARY_VISIBILITY
+void blake3_xof_many_avx512(const uint32_t cv[8],
+                            const uint8_t block[BLAKE3_BLOCK_LEN],
+                            uint8_t block_len, uint64_t counter, uint8_t flags,
+                            uint8_t* out, size_t outblocks);
+#endif
 #endif
 #endif

--- a/llvm/lib/Support/BLAKE3/blake3_neon.c
+++ b/llvm/lib/Support/BLAKE3/blake3_neon.c
@ -12,14 +12,12 @@

 INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
  // vld1q_u32 has alignment requirements. Don't use it.
-  uint32x4_t x;
-  memcpy(&x, src, 16);
-  return x;
+  return vreinterpretq_u32_u8(vld1q_u8(src));
 }

 INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
  // vst1q_u32 has alignment requirements. Don't use it.
-  memcpy(dest, &src, 16);
+  vst1q_u8(dest, vreinterpretq_u8_u32(src));
 }

 INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
@ -38,19 +36,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
 }

 INLINE uint32x4_t rot16_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+  // The straightforward implementation would be two shifts and an or, but that's
+  // slower on microarchitectures we've tested, so swap the 16-bit halves of each
+  // 32-bit lane instead. See https://github.com/BLAKE3-team/BLAKE3/pull/319.
+  // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+  return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
 }

 INLINE uint32x4_t rot12_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+  return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
 }

 INLINE uint32x4_t rot8_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+#if defined(__clang__)
+  return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
+#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
+  static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
+  return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
+#else 
+  return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
+#endif
 }

 INLINE uint32x4_t rot7_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+  return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
 }

 // TODO: compress_neon
@ -230,7 +245,6 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
 }

-static
 void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
                       const uint32_t key[8], uint64_t counter,
                       bool increment_counter, uint8_t flags,
--- a/llvm/lib/Support/BLAKE3/blake3_sse2.c
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2.c
@ -396,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
 }

 INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
-  // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
--- a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_windows_gnu.S
@ -2303,7 +2303,7 @@ blake3_compress_xof_sse2:
        ret


-.section .rodata
+.section .rdata
 .p2align  6
 BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85
--- a/llvm/lib/Support/BLAKE3/blake3_sse41.c
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41.c
@ -390,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
 }

 INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
-  // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
--- a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S
@ -2044,7 +2044,7 @@ blake3_compress_xof_sse41:
        ret


-.section .rodata
+.section .rdata
 .p2align  6
 BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85