/* sha256-arm.c - ARMv8 SHA extensions using C intrinsics */ /* Written and placed in public domain by Jeffrey Walton */ /* Based on code from ARM, and by Johannes Schneiders, Skip */ /* Hovsmith and Barry O'Rourke for the mbedTLS project. */ /* For some reason we need to use the C++ compiler. Otherwise */ /* all the intrinsics functions, like vsha256hq_u32, are missing. */ /* GCC118 on the compile farm with GCC 4.8.5 suffers the issue. */ /* g++ -DTEST_MAIN -march=armv8-a+crypto sha256-arm.c -o sha256.exe */ /* Visual Studio 2017 and above supports ARMv8, but its not clear how to detect */ /* it or use it at the moment. Also see http://stackoverflow.com/q/37244202, */ /* http://stackoverflow.com/q/41646026, and http://stackoverflow.com/q/41688101 */ #if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM) # if defined(__GNUC__) # include # endif # if defined(__ARM_NEON) || defined(_MSC_VER) || defined(__GNUC__) # include # endif /* GCC and LLVM Clang, but not Apple Clang */ # if defined(__GNUC__) && !defined(__apple_build_version__) # if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO) # include # endif # endif #endif /* ARM Headers */ #include #include // or #include #include "ee382n_bitcoin/sha256.h" #include static const uint32_t K[] = { 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2, }; /* Process multiple blocks. The caller is responsible for setting the initial */ /* state, and the caller is responsible for padding the final block. */ void sha256_process_arm(uint32_t state[8], const uint8_t data[], uint32_t length) { uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; uint32x4_t MSG0, MSG1, MSG2, MSG3; uint32x4_t TMP0, TMP1, TMP2; /* Load state */ STATE0 = vld1q_u32(&state[0]); STATE1 = vld1q_u32(&state[4]); while (length >= 64) { /* Save state */ ABEF_SAVE = STATE0; CDGH_SAVE = STATE1; /* Load message */ MSG0 = vld1q_u32((const uint32_t *)(data + 0)); MSG1 = vld1q_u32((const uint32_t *)(data + 16)); MSG2 = vld1q_u32((const uint32_t *)(data + 32)); MSG3 = vld1q_u32((const uint32_t *)(data + 48)); /* Reverse for little endian */ MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x00])); /* Rounds 0-3 */ MSG0 = vsha256su0q_u32(MSG0, MSG1); TMP2 = STATE0; TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x04])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); /* Rounds 4-7 */ MSG1 = vsha256su0q_u32(MSG1, MSG2); TMP2 = STATE0; TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x08])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); /* Rounds 8-11 */ MSG2 = vsha256su0q_u32(MSG2, MSG3); TMP2 = STATE0; TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x0c])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); /* Rounds 12-15 */ MSG3 = vsha256su0q_u32(MSG3, MSG0); TMP2 = STATE0; TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x10])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); /* Rounds 16-19 */ MSG0 = vsha256su0q_u32(MSG0, MSG1); TMP2 = STATE0; TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x14])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); /* Rounds 20-23 */ MSG1 = vsha256su0q_u32(MSG1, MSG2); TMP2 = STATE0; TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x18])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); /* Rounds 24-27 */ MSG2 = vsha256su0q_u32(MSG2, MSG3); TMP2 = STATE0; TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x1c])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); /* Rounds 28-31 */ MSG3 = vsha256su0q_u32(MSG3, MSG0); TMP2 = STATE0; TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x20])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); /* Rounds 32-35 */ MSG0 = vsha256su0q_u32(MSG0, MSG1); TMP2 = STATE0; TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x24])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); /* Rounds 36-39 */ MSG1 = vsha256su0q_u32(MSG1, MSG2); TMP2 = STATE0; TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x28])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); /* Rounds 40-43 */ MSG2 = vsha256su0q_u32(MSG2, MSG3); TMP2 = STATE0; TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x2c])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); /* Rounds 44-47 */ MSG3 = vsha256su0q_u32(MSG3, MSG0); TMP2 = STATE0; TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x30])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); /* Rounds 48-51 */ TMP2 = STATE0; TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x34])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); /* Rounds 52-55 */ TMP2 = STATE0; TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x38])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); /* Rounds 56-59 */ TMP2 = STATE0; TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x3c])); STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); /* Rounds 60-63 */ TMP2 = STATE0; STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); /* Combine state */ STATE0 = vaddq_u32(STATE0, ABEF_SAVE); STATE1 = vaddq_u32(STATE1, CDGH_SAVE); data += 64; length -= 64; } /* Save state */ vst1q_u32(&state[0], STATE0); vst1q_u32(&state[4], STATE1); } /** * @brief Resets the SHA256 state to initial values * * @param state Array of 8 uint32_t for maintaining state */ void sha256_init_arm(uint32_t state[8]) { static const uint32_t init_state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; memcpy(state, init_state, sizeof(init_state)); } void sha256_pad_message(const uint8_t *input, size_t input_len, uint8_t *output, size_t *output_len) { size_t initial_len = input_len; size_t padding_len = 64 - (input_len % 64); // Adjusted to be multiple of 64 bytes // If padding_len is less than 9, then an additional block is needed for the length, so adjust padding_len accordingly if (padding_len < 9) { padding_len += 64; } // Calculate the new length for the padded message *output_len = input_len + padding_len; // Copy the input message to the output buffer memcpy(output, input, initial_len); // Append a single '1' bit output[initial_len] = 0x80; // Append zeros until the output is a multiple of 64 bytes for (size_t i = initial_len + 1; i < *output_len - 8; i++) { output[i] = 0x00; } // Append the length of the original message in bits as a 64-bit big-endian integer uint64_t bit_length = initial_len * 8; for (int i = 0; i < 8; i++) { output[*output_len - 8 + i] = (bit_length >> ((7 - i) * 8)) & 0xFF; } } void sha256_arm(const uint8_t *input, size_t length, uint8_t hash[SHA256_BLOCK_SIZE]) { uint32_t state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; // printf("Input message:\n"); // for (size_t i = 0; i < length; i++) { // printf("%02X ", input[i]); // } // printf("\n"); // Prepare the padded message uint8_t buffer[1024]; // Ensure this buffer is large enough for the padded message size_t padded_length = 0; sha256_pad_message(input, length, buffer, &padded_length); // printf("Padded message:\n"); // for (size_t i = 0; i < padded_length; i++) { // printf("%02X ", buffer[i]); // } // printf("\n"); // Initialize state sha256_init_arm(state); // printf("Initial state:\n"); // for (int i = 0; i < 8; i++) { // printf("state[%d] = %08X\n", i, state[i]); // } // Process each 64-byte block // printf("Processing block \n"); sha256_process_arm(state, &buffer[0], padded_length); // printf("State after processing block:\n"); // for (int j = 0; j < 8; j++) { // printf("state[%d] = %08X\n", j, state[j]); // } // Convert the computed state to byte array in big endian for (int i = 0; i < 8; i++) { hash[i * 4 + 0] = (uint8_t)(state[i] >> 24); hash[i * 4 + 1] = (uint8_t)(state[i] >> 16); hash[i * 4 + 2] = (uint8_t)(state[i] >> 8); hash[i * 4 + 3] = (uint8_t)(state[i]); } } /** * Computes a double SHA-256 hash on given data using ARM architecture optimizations. The function first * hashes the input data, then hashes the result again. This method is commonly used in blockchain technologies. * * @param state Array of 8 uint32_t for the hash state, initialized prior to this call. * @param data Pointer to the data to be hashed. * @param len Length of the data in bytes. */ void sha256_double_arm(const uint8_t *input, size_t length, uint8_t hash[SHA256_BLOCK_SIZE]) { uint8_t intermediate_hash[SHA256_BLOCK_SIZE]; // Buffer to store intermediate hash // First hash sha256_arm(input, length, intermediate_hash); // Second hash sha256_arm(intermediate_hash, SHA256_BLOCK_SIZE, hash); } #if defined(TEST_MAIN) #include #include int main(int argc, char* argv[]) { /* empty message with padding */ uint8_t message[64]; memset(message, 0x00, sizeof(message)); message[0] = 0x80; /* initial state */ uint32_t state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; sha256_process_arm(state, message, sizeof(message)); const uint8_t b1 = (uint8_t)(state[0] >> 24); const uint8_t b2 = (uint8_t)(state[0] >> 16); const uint8_t b3 = (uint8_t)(state[0] >> 8); const uint8_t b4 = (uint8_t)(state[0] >> 0); const uint8_t b5 = (uint8_t)(state[1] >> 24); const uint8_t b6 = (uint8_t)(state[1] >> 16); const uint8_t b7 = (uint8_t)(state[1] >> 8); const uint8_t b8 = (uint8_t)(state[1] >> 0); /* e3b0c44298fc1c14... */ printf("SHA256 hash of empty message: "); printf("%02X%02X%02X%02X%02X%02X%02X%02X...\n", b1, b2, b3, b4, b5, b6, b7, b8); int success = ((b1 == 0xE3) && (b2 == 0xB0) && (b3 == 0xC4) && (b4 == 0x42) && (b5 == 0x98) && (b6 == 0xFC) && (b7 == 0x1C) && (b8 == 0x14)); if (success) printf("Success!\n"); else printf("Failure!\n"); return (success != 0 ? 0 : 1); } #endif