
BLAKE3 is a cryptographic hash function that is secure and very performant.

The C implementation originates from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c
The license is at https://github.com/BLAKE3-team/BLAKE3/blob/1.3.1/LICENSE

This patch adds:
* `llvm/include/llvm-c/blake3.h`: The BLAKE3 C API
* `llvm/include/llvm/Support/BLAKE3.h`: C++ wrapper of the C API
* `llvm/lib/Support/BLAKE3`: Directory containing the BLAKE3 C implementation files, including the `LICENSE` file
* `llvm/unittests/Support/BLAKE3Test.cpp`: unit tests for the BLAKE3 C++ wrapper

This initial patch contains the pristine BLAKE3 sources; a follow-up patch will introduce LLVM-specific prefixes to avoid conflicts if a client also links with its own BLAKE3 version.

Here are some timings comparing BLAKE3 with LLVM's SHA1/SHA256/MD5. The timings include the `AVX512`, `AVX2`, `neon`, and generic/portable implementations. The table shows the speed-up multiplier of BLAKE3 for hashing 100 MB:

| Processor               | SHA1  | SHA256 | MD5  |
|-------------------------|-------|--------|------|
| Intel Xeon W (AVX512)   | 10.4x | 27x    | 9.4x |
| Intel Xeon W (AVX2)     | 6.5x  | 17x    | 5.9x |
| Intel Xeon W (portable) | 1.3x  | 3.3x   | 1.1x |
| M1Pro (neon)            | 2.1x  | 4.7x   | 2.8x |
| M1Pro (portable)        | 1.1x  | 2.4x   | 1.5x |

Differential Revision: https://reviews.llvm.org/D121510
161 lines
5.8 KiB
C
161 lines
5.8 KiB
C
#include "blake3_impl.h"
|
|
#include <string.h>
|
|
|
|
// Rotates the 32-bit word `w` right by `c` bit positions and returns the
// result.
//
// Both shift counts are reduced modulo 32 before use. Without the masking,
// c == 0 would evaluate `w << 32`, and shifting a 32-bit value by its full
// width is undefined behavior in C. For counts in 1..31 the result is the
// same as the unmasked expression.
INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
  const uint32_t right = c & 31u;
  const uint32_t left = (32u - right) & 31u;
  return (w >> right) | (w << left);
}
|
|
|
|
// Mixing step applied to four words of `state`, selected by the indices
// a, b, c and d, folding in the two message words x and y.
//
// The words are updated in place through pointers, so each step sees the
// values written by the previous one. The rotation amounts are 16, 12, 8
// and 7, in that order.
INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
              uint32_t x, uint32_t y) {
  uint32_t *const wa = &state[a];
  uint32_t *const wb = &state[b];
  uint32_t *const wc = &state[c];
  uint32_t *const wd = &state[d];

  // First half: inject x, rotate by 16 and 12.
  *wa += *wb + x;
  *wd = rotr32(*wd ^ *wa, 16);
  *wc += *wd;
  *wb = rotr32(*wb ^ *wc, 12);

  // Second half: inject y, rotate by 8 and 7.
  *wa += *wb + y;
  *wd = rotr32(*wd ^ *wa, 8);
  *wc += *wd;
  *wb = rotr32(*wb ^ *wc, 7);
}
|
|
|
|
INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
|
|
// Select the message schedule based on the round.
|
|
const uint8_t *schedule = MSG_SCHEDULE[round];
|
|
|
|
// Mix the columns.
|
|
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
|
|
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
|
|
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
|
|
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
|
|
|
|
// Mix the rows.
|
|
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
|
|
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
|
|
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
|
|
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
|
|
}
|
|
|
|
INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
|
|
const uint8_t block[BLAKE3_BLOCK_LEN],
|
|
uint8_t block_len, uint64_t counter, uint8_t flags) {
|
|
uint32_t block_words[16];
|
|
block_words[0] = load32(block + 4 * 0);
|
|
block_words[1] = load32(block + 4 * 1);
|
|
block_words[2] = load32(block + 4 * 2);
|
|
block_words[3] = load32(block + 4 * 3);
|
|
block_words[4] = load32(block + 4 * 4);
|
|
block_words[5] = load32(block + 4 * 5);
|
|
block_words[6] = load32(block + 4 * 6);
|
|
block_words[7] = load32(block + 4 * 7);
|
|
block_words[8] = load32(block + 4 * 8);
|
|
block_words[9] = load32(block + 4 * 9);
|
|
block_words[10] = load32(block + 4 * 10);
|
|
block_words[11] = load32(block + 4 * 11);
|
|
block_words[12] = load32(block + 4 * 12);
|
|
block_words[13] = load32(block + 4 * 13);
|
|
block_words[14] = load32(block + 4 * 14);
|
|
block_words[15] = load32(block + 4 * 15);
|
|
|
|
state[0] = cv[0];
|
|
state[1] = cv[1];
|
|
state[2] = cv[2];
|
|
state[3] = cv[3];
|
|
state[4] = cv[4];
|
|
state[5] = cv[5];
|
|
state[6] = cv[6];
|
|
state[7] = cv[7];
|
|
state[8] = IV[0];
|
|
state[9] = IV[1];
|
|
state[10] = IV[2];
|
|
state[11] = IV[3];
|
|
state[12] = counter_low(counter);
|
|
state[13] = counter_high(counter);
|
|
state[14] = (uint32_t)block_len;
|
|
state[15] = (uint32_t)flags;
|
|
|
|
round_fn(state, &block_words[0], 0);
|
|
round_fn(state, &block_words[0], 1);
|
|
round_fn(state, &block_words[0], 2);
|
|
round_fn(state, &block_words[0], 3);
|
|
round_fn(state, &block_words[0], 4);
|
|
round_fn(state, &block_words[0], 5);
|
|
round_fn(state, &block_words[0], 6);
|
|
}
|
|
|
|
void blake3_compress_in_place_portable(uint32_t cv[8],
|
|
const uint8_t block[BLAKE3_BLOCK_LEN],
|
|
uint8_t block_len, uint64_t counter,
|
|
uint8_t flags) {
|
|
uint32_t state[16];
|
|
compress_pre(state, cv, block, block_len, counter, flags);
|
|
cv[0] = state[0] ^ state[8];
|
|
cv[1] = state[1] ^ state[9];
|
|
cv[2] = state[2] ^ state[10];
|
|
cv[3] = state[3] ^ state[11];
|
|
cv[4] = state[4] ^ state[12];
|
|
cv[5] = state[5] ^ state[13];
|
|
cv[6] = state[6] ^ state[14];
|
|
cv[7] = state[7] ^ state[15];
|
|
}
|
|
|
|
void blake3_compress_xof_portable(const uint32_t cv[8],
|
|
const uint8_t block[BLAKE3_BLOCK_LEN],
|
|
uint8_t block_len, uint64_t counter,
|
|
uint8_t flags, uint8_t out[64]) {
|
|
uint32_t state[16];
|
|
compress_pre(state, cv, block, block_len, counter, flags);
|
|
|
|
store32(&out[0 * 4], state[0] ^ state[8]);
|
|
store32(&out[1 * 4], state[1] ^ state[9]);
|
|
store32(&out[2 * 4], state[2] ^ state[10]);
|
|
store32(&out[3 * 4], state[3] ^ state[11]);
|
|
store32(&out[4 * 4], state[4] ^ state[12]);
|
|
store32(&out[5 * 4], state[5] ^ state[13]);
|
|
store32(&out[6 * 4], state[6] ^ state[14]);
|
|
store32(&out[7 * 4], state[7] ^ state[15]);
|
|
store32(&out[8 * 4], state[8] ^ cv[0]);
|
|
store32(&out[9 * 4], state[9] ^ cv[1]);
|
|
store32(&out[10 * 4], state[10] ^ cv[2]);
|
|
store32(&out[11 * 4], state[11] ^ cv[3]);
|
|
store32(&out[12 * 4], state[12] ^ cv[4]);
|
|
store32(&out[13 * 4], state[13] ^ cv[5]);
|
|
store32(&out[14 * 4], state[14] ^ cv[6]);
|
|
store32(&out[15 * 4], state[15] ^ cv[7]);
|
|
}
|
|
|
|
INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
|
|
const uint32_t key[8], uint64_t counter,
|
|
uint8_t flags, uint8_t flags_start,
|
|
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
|
|
uint32_t cv[8];
|
|
memcpy(cv, key, BLAKE3_KEY_LEN);
|
|
uint8_t block_flags = flags | flags_start;
|
|
while (blocks > 0) {
|
|
if (blocks == 1) {
|
|
block_flags |= flags_end;
|
|
}
|
|
blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
|
|
block_flags);
|
|
input = &input[BLAKE3_BLOCK_LEN];
|
|
blocks -= 1;
|
|
block_flags = flags;
|
|
}
|
|
store_cv_words(out, cv);
|
|
}
|
|
|
|
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|
|
size_t blocks, const uint32_t key[8],
|
|
uint64_t counter, bool increment_counter,
|
|
uint8_t flags, uint8_t flags_start,
|
|
uint8_t flags_end, uint8_t *out) {
|
|
while (num_inputs > 0) {
|
|
hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
|
|
flags_end, out);
|
|
if (increment_counter) {
|
|
counter += 1;
|
|
}
|
|
inputs += 1;
|
|
num_inputs -= 1;
|
|
out = &out[BLAKE3_OUT_LEN];
|
|
}
|
|
}
|