Bitcoin Core 31.0.0
P2P Digital Currency
Loading...
Searching...
No Matches
sha256_arm_shani.cpp
Go to the documentation of this file.
1// Copyright (c) 2022-present The Bitcoin Core developers
2// Distributed under the MIT software license, see the accompanying
3// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4//
5// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6// Written and placed in public domain by Jeffrey Walton.
7// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8// Barry O'Rourke for the mbedTLS project.
9// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11#ifdef ENABLE_ARM_SHANI
12
13#include <array>
14#include <cstdint>
15#include <cstddef>
16#include <arm_neon.h>
17
18namespace {
// File-local table of 64 32-bit constants. Elsewhere in this file it is read
// four words at a time via vld1q_u32(&K[i]) for i = 0, 4, ..., 60, one load per
// group of four rounds. alignas(uint32x4_t) gives the array the alignment of the
// NEON vector type.
// The values match the SHA-256 round constants K[0..63] of FIPS 180-4.
19alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20{
21 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37};
38}
39
40namespace sha256_arm_shani {
// Runs the 64-round compression loop over `blocks` consecutive 64-byte chunks,
// updating the 8-word state in place.
//
//   s      - state words; s[0..3] and s[4..7] are loaded into STATE0/STATE1 on
//            entry and written back from them on exit.
//   chunk  - input pointer; advanced by 64 bytes per loop iteration.
//   blocks - number of loop iterations (the loop is `while (blocks--)`, so
//            blocks == 0 performs only the state load and store).
//
// NOTE(review): the line numbers embedded in this listing jump (e.g. 42 -> 46,
// 57 -> 62, 70 -> 75), so the declarations of STATE0/STATE1, MSG0..MSG3 and
// TMP0/TMP2, the input loads, and the remaining statements of every round group
// are not visible here. The comments below describe only the visible statements.
41void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42{
46
47 // Load state
48 STATE0 = vld1q_u32(&s[0]);
49 STATE1 = vld1q_u32(&s[4]);
50
51 while (blocks--)
52 {
53 // Save state
56
57 // Load and convert input chunk to Big Endian
62 chunk += 64;
63
64 // Original implementation preloaded message and constant addition which was 1-3% slower.
65 // Now included as first step in quad round code saving one Q Neon register
66 // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67
// Every group of four rounds below starts the same way: TMP0 receives one of
// MSG0..MSG3 (used cyclically) plus the next four words of K, and TMP2 takes a
// copy of STATE0 (presumably because STATE0 is overwritten by a statement not
// shown in this listing).
68 // Rounds 1-4
69 TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70 TMP2 = STATE0;
75
76 // Rounds 5-8
77 TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78 TMP2 = STATE0;
83
84 // Rounds 9-12
85 TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86 TMP2 = STATE0;
91
92 // Rounds 13-16
93 TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94 TMP2 = STATE0;
99
100 // Rounds 17-20
101 TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102 TMP2 = STATE0;
107
108 // Rounds 21-24
109 TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110 TMP2 = STATE0;
115
116 // Rounds 25-28
117 TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118 TMP2 = STATE0;
123
124 // Rounds 29-32
125 TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126 TMP2 = STATE0;
131
132 // Rounds 33-36
133 TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134 TMP2 = STATE0;
139
140 // Rounds 37-40
141 TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142 TMP2 = STATE0;
147
148 // Rounds 41-44
149 TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150 TMP2 = STATE0;
155
156 // Rounds 45-48
157 TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158 TMP2 = STATE0;
163
164 // Rounds 49-52
165 TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166 TMP2 = STATE0;
169
170 // Rounds 53-56
171 TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172 TMP2 = STATE0;
175
176 // Rounds 57-60
177 TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178 TMP2 = STATE0;
181
182 // Rounds 61-64
183 TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184 TMP2 = STATE0;
187
188 // Update state
191 }
192
193 // Save final state
// Written once, after all blocks have been processed; inside the loop the state
// is carried in STATE0/STATE1.
194 vst1q_u32(&s[0], STATE0);
195 vst1q_u32(&s[4], STATE1);
196}
197}
198
199namespace sha256d64_arm_shani {
// Two-lane variant: every visible step is carried out for an "A" lane and a "B"
// lane (STATE0A/STATE0B, TMP2A/TMP2B, MSG0A/MSG0B, ...), in three consecutive
// 64-round transforms:
//   Transform 1 - starts from INIT and uses the K table.
//   Transform 2 - uses the precomputed MIDS table in place of K; no message
//                 words are involved in the visible statements.
//   Transform 3 - restarts from INIT; its 16-word message is the 8 state words
//                 left by transform 2 followed by the 8 words of FINAL.
// Presumably this computes double-SHA256 of two independent 64-byte inputs
// (per the sha256d64 namespace name and the "specialized for 64-byte inputs"
// note in the file header) - TODO confirm against the caller.
//
//   output - destination buffer; the store statements are not visible here.
//   input  - source buffer; the load statements are not visible here.
//
// NOTE(review): the line numbers embedded in this listing jump (e.g. 236 -> 241,
// 243 -> 245, 259 -> 262), so all variable declarations, the B-lane state
// loads, the input loads/byte swaps, the SHA intrinsics of every round group and
// the final stores are missing from this view. The comments below describe only
// the visible statements.
200void Transform_2way(unsigned char* output, const unsigned char* input)
201{
202 /* Initial state. */
// Loaded as the starting state of transforms 1 and 3, and loaded again into TMP
// in the "Update state" step of each of those transforms.
203 alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206 };
207
208 /* Precomputed message schedule for the 2nd transform. */
// Loaded four words per round group, exactly where transform 1 loads K.
// MIDS[0] equals K[0] + 0x80000000 and MIDS[15] equals K[15] + 0x200 (mod 2^32);
// presumably every entry is K[i] plus the fixed schedule word of the padding
// block for a 64-byte message.
209 alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226 };
227
228 /* A few precomputed message schedule values for the 3rd transform. */
// Layout as used in transform 3:
//   FINS[0..3]  - loaded into TMP for rounds 9-12; equals K[8..11] with
//                 0x80000000 added to the first word (mod 2^32).
//   FINS[4..7]  - loaded into MSG2A/MSG2B in rounds 9-12; same values as
//                 FINAL[0..3].
//   FINS[8..11] - loaded into TMP for rounds 13-16; equals K[12..15] with 0x100
//                 added to the last word.
229 alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230 0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231 0x80000000, 0x00000000, 0x00000000, 0x00000000,
232 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233 };
234
235 /* Padding processed in the 3rd transform (byteswapped). */
// Forms message words 8..15 of transform 3 (loaded into MSG2*/MSG3*). The last
// word 0x100 is 256, presumably the bit length of the 8-word (32-byte) message
// that precedes the padding.
236 alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237
241
242 // Transform 1: Load state
243 STATE0A = vld1q_u32(&INIT[0]);
245 STATE1A = vld1q_u32(&INIT[4]);
247
248 // Transform 1: Load and convert input chunk to Big Endian
257
// In every round group of transform 1, TMP is loaded with the next four words
// of K and TMP2A/TMP2B take copies of the two lanes' STATE0 (presumably because
// STATE0A/STATE0B are overwritten by statements not shown in this listing).
258 // Transform 1: Rounds 1-4
259 TMP = vld1q_u32(&K[0]);
262 TMP2A = STATE0A;
263 TMP2B = STATE0B;
272
273 // Transform 1: Rounds 5-8
274 TMP = vld1q_u32(&K[4]);
277 TMP2A = STATE0A;
278 TMP2B = STATE0B;
287
288 // Transform 1: Rounds 9-12
289 TMP = vld1q_u32(&K[8]);
292 TMP2A = STATE0A;
293 TMP2B = STATE0B;
302
303 // Transform 1: Rounds 13-16
304 TMP = vld1q_u32(&K[12]);
307 TMP2A = STATE0A;
308 TMP2B = STATE0B;
317
318 // Transform 1: Rounds 17-20
319 TMP = vld1q_u32(&K[16]);
322 TMP2A = STATE0A;
323 TMP2B = STATE0B;
332
333 // Transform 1: Rounds 21-24
334 TMP = vld1q_u32(&K[20]);
337 TMP2A = STATE0A;
338 TMP2B = STATE0B;
347
348 // Transform 1: Rounds 25-28
349 TMP = vld1q_u32(&K[24]);
352 TMP2A = STATE0A;
353 TMP2B = STATE0B;
362
363 // Transform 1: Rounds 29-32
364 TMP = vld1q_u32(&K[28]);
367 TMP2A = STATE0A;
368 TMP2B = STATE0B;
377
378 // Transform 1: Rounds 33-36
379 TMP = vld1q_u32(&K[32]);
382 TMP2A = STATE0A;
383 TMP2B = STATE0B;
392
393 // Transform 1: Rounds 37-40
394 TMP = vld1q_u32(&K[36]);
397 TMP2A = STATE0A;
398 TMP2B = STATE0B;
407
408 // Transform 1: Rounds 41-44
409 TMP = vld1q_u32(&K[40]);
412 TMP2A = STATE0A;
413 TMP2B = STATE0B;
422
423 // Transform 1: Rounds 45-48
424 TMP = vld1q_u32(&K[44]);
427 TMP2A = STATE0A;
428 TMP2B = STATE0B;
437
438 // Transform 1: Rounds 49-52
439 TMP = vld1q_u32(&K[48]);
442 TMP2A = STATE0A;
443 TMP2B = STATE0B;
448
449 // Transform 1: Rounds 53-56
450 TMP = vld1q_u32(&K[52]);
453 TMP2A = STATE0A;
454 TMP2B = STATE0B;
459
460 // Transform 1: Rounds 57-60
461 TMP = vld1q_u32(&K[56]);
464 TMP2A = STATE0A;
465 TMP2B = STATE0B;
470
471 // Transform 1: Rounds 61-64
472 TMP = vld1q_u32(&K[60]);
475 TMP2A = STATE0A;
476 TMP2B = STATE0B;
481
482 // Transform 1: Update state
// The starting state was the constant INIT, so it is reloaded from the table
// here; presumably it is then added to both lanes' working state (the additions
// are not shown in this listing).
483 TMP = vld1q_u32(&INIT[0]);
486 TMP = vld1q_u32(&INIT[4]);
489
490 // Transform 2: Save state
495
// Transform 2 follows the same per-group pattern as transform 1, except that
// TMP is loaded from MIDS instead of K.
496 // Transform 2: Rounds 1-4
497 TMP = vld1q_u32(&MIDS[0]);
498 TMP2A = STATE0A;
499 TMP2B = STATE0B;
504
505 // Transform 2: Rounds 5-8
506 TMP = vld1q_u32(&MIDS[4]);
507 TMP2A = STATE0A;
508 TMP2B = STATE0B;
513
514 // Transform 2: Rounds 9-12
515 TMP = vld1q_u32(&MIDS[8]);
516 TMP2A = STATE0A;
517 TMP2B = STATE0B;
522
523 // Transform 2: Rounds 13-16
524 TMP = vld1q_u32(&MIDS[12]);
525 TMP2A = STATE0A;
526 TMP2B = STATE0B;
531
532 // Transform 2: Rounds 17-20
533 TMP = vld1q_u32(&MIDS[16]);
534 TMP2A = STATE0A;
535 TMP2B = STATE0B;
540
541 // Transform 2: Rounds 21-24
542 TMP = vld1q_u32(&MIDS[20]);
543 TMP2A = STATE0A;
544 TMP2B = STATE0B;
549
550 // Transform 2: Rounds 25-28
551 TMP = vld1q_u32(&MIDS[24]);
552 TMP2A = STATE0A;
553 TMP2B = STATE0B;
558
559 // Transform 2: Rounds 29-32
560 TMP = vld1q_u32(&MIDS[28]);
561 TMP2A = STATE0A;
562 TMP2B = STATE0B;
567
568 // Transform 2: Rounds 33-36
569 TMP = vld1q_u32(&MIDS[32]);
570 TMP2A = STATE0A;
571 TMP2B = STATE0B;
576
577 // Transform 2: Rounds 37-40
578 TMP = vld1q_u32(&MIDS[36]);
579 TMP2A = STATE0A;
580 TMP2B = STATE0B;
585
586 // Transform 2: Rounds 41-44
587 TMP = vld1q_u32(&MIDS[40]);
588 TMP2A = STATE0A;
589 TMP2B = STATE0B;
594
595 // Transform 2: Rounds 45-48
596 TMP = vld1q_u32(&MIDS[44]);
597 TMP2A = STATE0A;
598 TMP2B = STATE0B;
603
604 // Transform 2: Rounds 49-52
605 TMP = vld1q_u32(&MIDS[48]);
606 TMP2A = STATE0A;
607 TMP2B = STATE0B;
612
613 // Transform 2: Rounds 53-56
614 TMP = vld1q_u32(&MIDS[52]);
615 TMP2A = STATE0A;
616 TMP2B = STATE0B;
621
622 // Transform 2: Rounds 57-60
623 TMP = vld1q_u32(&MIDS[56]);
624 TMP2A = STATE0A;
625 TMP2B = STATE0B;
630
631 // Transform 2: Rounds 61-64
632 TMP = vld1q_u32(&MIDS[60]);
633 TMP2A = STATE0A;
634 TMP2B = STATE0B;
639
640 // Transform 2: Update state
645
646 // Transform 3: Pad previous output
// Message words 0..7 of each lane are that lane's state after transform 2;
// words 8..15 are the constant FINAL padding, shared by both lanes.
647 MSG0A = STATE0A;
648 MSG0B = STATE0B;
649 MSG1A = STATE1A;
650 MSG1B = STATE1B;
651 MSG2A = vld1q_u32(&FINAL[0]);
652 MSG2B = MSG2A;
653 MSG3A = vld1q_u32(&FINAL[4]);
654 MSG3B = MSG3A;
655
656 // Transform 3: Load state
657 STATE0A = vld1q_u32(&INIT[0]);
659 STATE1A = vld1q_u32(&INIT[4]);
661
662 // Transform 3: Rounds 1-4
663 TMP = vld1q_u32(&K[0]);
666 TMP2A = STATE0A;
667 TMP2B = STATE0B;
676
677 // Transform 3: Rounds 5-8
678 TMP = vld1q_u32(&K[4]);
681 TMP2A = STATE0A;
682 TMP2B = STATE0B;
691
// Rounds 9-16 consume the constant padding words, so TMP comes from FINS
// (K with the padding word already added) rather than from K.
692 // Transform 3: Rounds 9-12
693 TMP = vld1q_u32(&FINS[0]);
694 TMP2A = STATE0A;
695 TMP2B = STATE0B;
// MSG2A/MSG2B are reloaded with the padding words here (FINS[4..7] holds the
// same values as FINAL[0..3]); presumably an earlier statement not shown in
// this listing overwrote them.
696 MSG2A = vld1q_u32(&FINS[4]);
697 MSG2B = MSG2A;
704
705 // Transform 3: Rounds 13-16
706 TMP = vld1q_u32(&FINS[8]);
707 TMP2A = STATE0A;
708 TMP2B = STATE0B;
717
718 // Transform 3: Rounds 17-20
719 TMP = vld1q_u32(&K[16]);
722 TMP2A = STATE0A;
723 TMP2B = STATE0B;
732
733 // Transform 3: Rounds 21-24
734 TMP = vld1q_u32(&K[20]);
737 TMP2A = STATE0A;
738 TMP2B = STATE0B;
747
748 // Transform 3: Rounds 25-28
749 TMP = vld1q_u32(&K[24]);
752 TMP2A = STATE0A;
753 TMP2B = STATE0B;
762
763 // Transform 3: Rounds 29-32
764 TMP = vld1q_u32(&K[28]);
767 TMP2A = STATE0A;
768 TMP2B = STATE0B;
777
778 // Transform 3: Rounds 33-36
779 TMP = vld1q_u32(&K[32]);
782 TMP2A = STATE0A;
783 TMP2B = STATE0B;
792
793 // Transform 3: Rounds 37-40
794 TMP = vld1q_u32(&K[36]);
797 TMP2A = STATE0A;
798 TMP2B = STATE0B;
807
808 // Transform 3: Rounds 41-44
809 TMP = vld1q_u32(&K[40]);
812 TMP2A = STATE0A;
813 TMP2B = STATE0B;
822
823 // Transform 3: Rounds 45-48
824 TMP = vld1q_u32(&K[44]);
827 TMP2A = STATE0A;
828 TMP2B = STATE0B;
837
838 // Transform 3: Rounds 49-52
839 TMP = vld1q_u32(&K[48]);
842 TMP2A = STATE0A;
843 TMP2B = STATE0B;
848
849 // Transform 3: Rounds 53-56
850 TMP = vld1q_u32(&K[52]);
853 TMP2A = STATE0A;
854 TMP2B = STATE0B;
859
860 // Transform 3: Rounds 57-60
861 TMP = vld1q_u32(&K[56]);
864 TMP2A = STATE0A;
865 TMP2B = STATE0B;
870
871 // Transform 3: Rounds 61-64
872 TMP = vld1q_u32(&K[60]);
875 TMP2A = STATE0A;
876 TMP2B = STATE0B;
881
882 // Transform 3: Update state
// As in transform 1, the constant starting state INIT is reloaded from the
// table (the statements that use TMP are not shown in this listing).
883 TMP = vld1q_u32(&INIT[0]);
886 TMP = vld1q_u32(&INIT[4]);
889
890 // Store result
895}
896}
897
898#endif
void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks)
void Transform_2way(unsigned char *out, const unsigned char *in)
@ FINAL
Neither this tx nor a mempool ancestor signals rbf.
constexpr auto Ticks(Dur2 d)
Helper to count the seconds of a duration/time_point.
Definition time.h:73