// Bitcoin Core 31.0.0 — P2P Digital Currency
// File: sha256_arm_shani.cpp
// Copyright (c) 2022-present The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
//
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
// Written and placed in public domain by Jeffrey Walton.
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
// Barry O'Rourke for the mbedTLS project.
// Variant specialized for 64-byte inputs added by Pieter Wuille.

#ifdef ENABLE_ARM_SHANI

#include <array>
#include <cstdint>
#include <cstddef>
#include <arm_neon.h>

18 namespace {
19 alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20 {
21  0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22  0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23  0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24  0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25  0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26  0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27  0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28  0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29  0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30  0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31  0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32  0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33  0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34  0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35  0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36  0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37 };
38 }
39 
40 namespace sha256_arm_shani {
41 void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42 {
43  uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
44  uint32x4_t MSG0, MSG1, MSG2, MSG3;
45  uint32x4_t TMP0, TMP2;
46 
47  // Load state
48  STATE0 = vld1q_u32(&s[0]);
49  STATE1 = vld1q_u32(&s[4]);
50 
51  while (blocks--)
52  {
53  // Save state
54  ABEF_SAVE = STATE0;
55  CDGH_SAVE = STATE1;
56 
57  // Load and convert input chunk to Big Endian
58  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
59  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
60  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
61  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
62  chunk += 64;
63 
64  // Original implementation preloaded message and constant addition which was 1-3% slower.
65  // Now included as first step in quad round code saving one Q Neon register
66  // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67 
68  // Rounds 1-4
69  TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70  TMP2 = STATE0;
71  MSG0 = vsha256su0q_u32(MSG0, MSG1);
72  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
73  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
74  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
75 
76  // Rounds 5-8
77  TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78  TMP2 = STATE0;
79  MSG1 = vsha256su0q_u32(MSG1, MSG2);
80  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
81  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
82  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
83 
84  // Rounds 9-12
85  TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86  TMP2 = STATE0;
87  MSG2 = vsha256su0q_u32(MSG2, MSG3);
88  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
89  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
90  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
91 
92  // Rounds 13-16
93  TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94  TMP2 = STATE0;
95  MSG3 = vsha256su0q_u32(MSG3, MSG0);
96  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
97  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
98  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
99 
100  // Rounds 17-20
101  TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102  TMP2 = STATE0;
103  MSG0 = vsha256su0q_u32(MSG0, MSG1);
104  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107 
108  // Rounds 21-24
109  TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110  TMP2 = STATE0;
111  MSG1 = vsha256su0q_u32(MSG1, MSG2);
112  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115 
116  // Rounds 25-28
117  TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118  TMP2 = STATE0;
119  MSG2 = vsha256su0q_u32(MSG2, MSG3);
120  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123 
124  // Rounds 29-32
125  TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126  TMP2 = STATE0;
127  MSG3 = vsha256su0q_u32(MSG3, MSG0);
128  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131 
132  // Rounds 33-36
133  TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134  TMP2 = STATE0;
135  MSG0 = vsha256su0q_u32(MSG0, MSG1);
136  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139 
140  // Rounds 37-40
141  TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142  TMP2 = STATE0;
143  MSG1 = vsha256su0q_u32(MSG1, MSG2);
144  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147 
148  // Rounds 41-44
149  TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150  TMP2 = STATE0;
151  MSG2 = vsha256su0q_u32(MSG2, MSG3);
152  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155 
156  // Rounds 45-48
157  TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158  TMP2 = STATE0;
159  MSG3 = vsha256su0q_u32(MSG3, MSG0);
160  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163 
164  // Rounds 49-52
165  TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166  TMP2 = STATE0;
167  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169 
170  // Rounds 53-56
171  TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172  TMP2 = STATE0;
173  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175 
176  // Rounds 57-60
177  TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178  TMP2 = STATE0;
179  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181 
182  // Rounds 61-64
183  TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184  TMP2 = STATE0;
185  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187 
188  // Update state
189  STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
190  STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
191  }
192 
193  // Save final state
194  vst1q_u32(&s[0], STATE0);
195  vst1q_u32(&s[4], STATE1);
196 }
197 }
198 
199 namespace sha256d64_arm_shani {
200 void Transform_2way(unsigned char* output, const unsigned char* input)
201 {
202  /* Initial state. */
203  alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204  0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205  0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206  };
207 
208  /* Precomputed message schedule for the 2nd transform. */
209  alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210  0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214  0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215  0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216  0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217  0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218  0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219  0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220  0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221  0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222  0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223  0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224  0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225  0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226  };
227 
228  /* A few precomputed message schedule values for the 3rd transform. */
229  alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230  0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231  0x80000000, 0x00000000, 0x00000000, 0x00000000,
232  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233  };
234 
235  /* Padding processed in the 3rd transform (byteswapped). */
236  alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237 
238  uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
239  uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240  uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241 
242  // Transform 1: Load state
243  STATE0A = vld1q_u32(&INIT[0]);
244  STATE0B = STATE0A;
245  STATE1A = vld1q_u32(&INIT[4]);
246  STATE1B = STATE1A;
247 
248  // Transform 1: Load and convert input chunk to Big Endian
249  MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250  MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251  MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252  MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253  MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254  MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255  MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256  MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257 
258  // Transform 1: Rounds 1-4
259  TMP = vld1q_u32(&K[0]);
260  TMP0A = vaddq_u32(MSG0A, TMP);
261  TMP0B = vaddq_u32(MSG0B, TMP);
262  TMP2A = STATE0A;
263  TMP2B = STATE0B;
264  MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265  MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270  MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271  MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272 
273  // Transform 1: Rounds 5-8
274  TMP = vld1q_u32(&K[4]);
275  TMP0A = vaddq_u32(MSG1A, TMP);
276  TMP0B = vaddq_u32(MSG1B, TMP);
277  TMP2A = STATE0A;
278  TMP2B = STATE0B;
279  MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280  MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285  MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286  MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287 
288  // Transform 1: Rounds 9-12
289  TMP = vld1q_u32(&K[8]);
290  TMP0A = vaddq_u32(MSG2A, TMP);
291  TMP0B = vaddq_u32(MSG2B, TMP);
292  TMP2A = STATE0A;
293  TMP2B = STATE0B;
294  MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295  MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300  MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301  MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302 
303  // Transform 1: Rounds 13-16
304  TMP = vld1q_u32(&K[12]);
305  TMP0A = vaddq_u32(MSG3A, TMP);
306  TMP0B = vaddq_u32(MSG3B, TMP);
307  TMP2A = STATE0A;
308  TMP2B = STATE0B;
309  MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310  MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315  MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316  MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317 
318  // Transform 1: Rounds 17-20
319  TMP = vld1q_u32(&K[16]);
320  TMP0A = vaddq_u32(MSG0A, TMP);
321  TMP0B = vaddq_u32(MSG0B, TMP);
322  TMP2A = STATE0A;
323  TMP2B = STATE0B;
324  MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325  MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330  MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331  MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332 
333  // Transform 1: Rounds 21-24
334  TMP = vld1q_u32(&K[20]);
335  TMP0A = vaddq_u32(MSG1A, TMP);
336  TMP0B = vaddq_u32(MSG1B, TMP);
337  TMP2A = STATE0A;
338  TMP2B = STATE0B;
339  MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340  MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345  MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346  MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347 
348  // Transform 1: Rounds 25-28
349  TMP = vld1q_u32(&K[24]);
350  TMP0A = vaddq_u32(MSG2A, TMP);
351  TMP0B = vaddq_u32(MSG2B, TMP);
352  TMP2A = STATE0A;
353  TMP2B = STATE0B;
354  MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355  MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360  MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361  MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362 
363  // Transform 1: Rounds 29-32
364  TMP = vld1q_u32(&K[28]);
365  TMP0A = vaddq_u32(MSG3A, TMP);
366  TMP0B = vaddq_u32(MSG3B, TMP);
367  TMP2A = STATE0A;
368  TMP2B = STATE0B;
369  MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370  MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375  MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376  MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377 
378  // Transform 1: Rounds 33-36
379  TMP = vld1q_u32(&K[32]);
380  TMP0A = vaddq_u32(MSG0A, TMP);
381  TMP0B = vaddq_u32(MSG0B, TMP);
382  TMP2A = STATE0A;
383  TMP2B = STATE0B;
384  MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385  MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390  MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391  MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392 
393  // Transform 1: Rounds 37-40
394  TMP = vld1q_u32(&K[36]);
395  TMP0A = vaddq_u32(MSG1A, TMP);
396  TMP0B = vaddq_u32(MSG1B, TMP);
397  TMP2A = STATE0A;
398  TMP2B = STATE0B;
399  MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400  MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405  MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406  MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407 
408  // Transform 1: Rounds 41-44
409  TMP = vld1q_u32(&K[40]);
410  TMP0A = vaddq_u32(MSG2A, TMP);
411  TMP0B = vaddq_u32(MSG2B, TMP);
412  TMP2A = STATE0A;
413  TMP2B = STATE0B;
414  MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415  MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420  MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421  MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422 
423  // Transform 1: Rounds 45-48
424  TMP = vld1q_u32(&K[44]);
425  TMP0A = vaddq_u32(MSG3A, TMP);
426  TMP0B = vaddq_u32(MSG3B, TMP);
427  TMP2A = STATE0A;
428  TMP2B = STATE0B;
429  MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430  MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435  MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436  MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437 
438  // Transform 1: Rounds 49-52
439  TMP = vld1q_u32(&K[48]);
440  TMP0A = vaddq_u32(MSG0A, TMP);
441  TMP0B = vaddq_u32(MSG0B, TMP);
442  TMP2A = STATE0A;
443  TMP2B = STATE0B;
444  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448 
449  // Transform 1: Rounds 53-56
450  TMP = vld1q_u32(&K[52]);
451  TMP0A = vaddq_u32(MSG1A, TMP);
452  TMP0B = vaddq_u32(MSG1B, TMP);
453  TMP2A = STATE0A;
454  TMP2B = STATE0B;
455  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459 
460  // Transform 1: Rounds 57-60
461  TMP = vld1q_u32(&K[56]);
462  TMP0A = vaddq_u32(MSG2A, TMP);
463  TMP0B = vaddq_u32(MSG2B, TMP);
464  TMP2A = STATE0A;
465  TMP2B = STATE0B;
466  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470 
471  // Transform 1: Rounds 61-64
472  TMP = vld1q_u32(&K[60]);
473  TMP0A = vaddq_u32(MSG3A, TMP);
474  TMP0B = vaddq_u32(MSG3B, TMP);
475  TMP2A = STATE0A;
476  TMP2B = STATE0B;
477  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481 
482  // Transform 1: Update state
483  TMP = vld1q_u32(&INIT[0]);
484  STATE0A = vaddq_u32(STATE0A, TMP);
485  STATE0B = vaddq_u32(STATE0B, TMP);
486  TMP = vld1q_u32(&INIT[4]);
487  STATE1A = vaddq_u32(STATE1A, TMP);
488  STATE1B = vaddq_u32(STATE1B, TMP);
489 
490  // Transform 2: Save state
491  ABEF_SAVEA = STATE0A;
492  ABEF_SAVEB = STATE0B;
493  CDGH_SAVEA = STATE1A;
494  CDGH_SAVEB = STATE1B;
495 
496  // Transform 2: Rounds 1-4
497  TMP = vld1q_u32(&MIDS[0]);
498  TMP2A = STATE0A;
499  TMP2B = STATE0B;
500  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504 
505  // Transform 2: Rounds 5-8
506  TMP = vld1q_u32(&MIDS[4]);
507  TMP2A = STATE0A;
508  TMP2B = STATE0B;
509  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513 
514  // Transform 2: Rounds 9-12
515  TMP = vld1q_u32(&MIDS[8]);
516  TMP2A = STATE0A;
517  TMP2B = STATE0B;
518  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522 
523  // Transform 2: Rounds 13-16
524  TMP = vld1q_u32(&MIDS[12]);
525  TMP2A = STATE0A;
526  TMP2B = STATE0B;
527  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531 
532  // Transform 2: Rounds 17-20
533  TMP = vld1q_u32(&MIDS[16]);
534  TMP2A = STATE0A;
535  TMP2B = STATE0B;
536  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540 
541  // Transform 2: Rounds 21-24
542  TMP = vld1q_u32(&MIDS[20]);
543  TMP2A = STATE0A;
544  TMP2B = STATE0B;
545  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549 
550  // Transform 2: Rounds 25-28
551  TMP = vld1q_u32(&MIDS[24]);
552  TMP2A = STATE0A;
553  TMP2B = STATE0B;
554  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558 
559  // Transform 2: Rounds 29-32
560  TMP = vld1q_u32(&MIDS[28]);
561  TMP2A = STATE0A;
562  TMP2B = STATE0B;
563  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567 
568  // Transform 2: Rounds 33-36
569  TMP = vld1q_u32(&MIDS[32]);
570  TMP2A = STATE0A;
571  TMP2B = STATE0B;
572  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576 
577  // Transform 2: Rounds 37-40
578  TMP = vld1q_u32(&MIDS[36]);
579  TMP2A = STATE0A;
580  TMP2B = STATE0B;
581  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585 
586  // Transform 2: Rounds 41-44
587  TMP = vld1q_u32(&MIDS[40]);
588  TMP2A = STATE0A;
589  TMP2B = STATE0B;
590  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594 
595  // Transform 2: Rounds 45-48
596  TMP = vld1q_u32(&MIDS[44]);
597  TMP2A = STATE0A;
598  TMP2B = STATE0B;
599  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603 
604  // Transform 2: Rounds 49-52
605  TMP = vld1q_u32(&MIDS[48]);
606  TMP2A = STATE0A;
607  TMP2B = STATE0B;
608  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612 
613  // Transform 2: Rounds 53-56
614  TMP = vld1q_u32(&MIDS[52]);
615  TMP2A = STATE0A;
616  TMP2B = STATE0B;
617  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621 
622  // Transform 2: Rounds 57-60
623  TMP = vld1q_u32(&MIDS[56]);
624  TMP2A = STATE0A;
625  TMP2B = STATE0B;
626  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630 
631  // Transform 2: Rounds 61-64
632  TMP = vld1q_u32(&MIDS[60]);
633  TMP2A = STATE0A;
634  TMP2B = STATE0B;
635  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639 
640  // Transform 2: Update state
641  STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
642  STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
643  STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
644  STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);
645 
646  // Transform 3: Pad previous output
647  MSG0A = STATE0A;
648  MSG0B = STATE0B;
649  MSG1A = STATE1A;
650  MSG1B = STATE1B;
651  MSG2A = vld1q_u32(&FINAL[0]);
652  MSG2B = MSG2A;
653  MSG3A = vld1q_u32(&FINAL[4]);
654  MSG3B = MSG3A;
655 
656  // Transform 3: Load state
657  STATE0A = vld1q_u32(&INIT[0]);
658  STATE0B = STATE0A;
659  STATE1A = vld1q_u32(&INIT[4]);
660  STATE1B = STATE1A;
661 
662  // Transform 3: Rounds 1-4
663  TMP = vld1q_u32(&K[0]);
664  TMP0A = vaddq_u32(MSG0A, TMP);
665  TMP0B = vaddq_u32(MSG0B, TMP);
666  TMP2A = STATE0A;
667  TMP2B = STATE0B;
668  MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669  MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674  MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675  MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676 
677  // Transform 3: Rounds 5-8
678  TMP = vld1q_u32(&K[4]);
679  TMP0A = vaddq_u32(MSG1A, TMP);
680  TMP0B = vaddq_u32(MSG1B, TMP);
681  TMP2A = STATE0A;
682  TMP2B = STATE0B;
683  MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684  MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689  MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690  MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691 
692  // Transform 3: Rounds 9-12
693  TMP = vld1q_u32(&FINS[0]);
694  TMP2A = STATE0A;
695  TMP2B = STATE0B;
696  MSG2A = vld1q_u32(&FINS[4]);
697  MSG2B = MSG2A;
698  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702  MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703  MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704 
705  // Transform 3: Rounds 13-16
706  TMP = vld1q_u32(&FINS[8]);
707  TMP2A = STATE0A;
708  TMP2B = STATE0B;
709  MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710  MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715  MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716  MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717 
718  // Transform 3: Rounds 17-20
719  TMP = vld1q_u32(&K[16]);
720  TMP0A = vaddq_u32(MSG0A, TMP);
721  TMP0B = vaddq_u32(MSG0B, TMP);
722  TMP2A = STATE0A;
723  TMP2B = STATE0B;
724  MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725  MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730  MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731  MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732 
733  // Transform 3: Rounds 21-24
734  TMP = vld1q_u32(&K[20]);
735  TMP0A = vaddq_u32(MSG1A, TMP);
736  TMP0B = vaddq_u32(MSG1B, TMP);
737  TMP2A = STATE0A;
738  TMP2B = STATE0B;
739  MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740  MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745  MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746  MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747 
748  // Transform 3: Rounds 25-28
749  TMP = vld1q_u32(&K[24]);
750  TMP0A = vaddq_u32(MSG2A, TMP);
751  TMP0B = vaddq_u32(MSG2B, TMP);
752  TMP2A = STATE0A;
753  TMP2B = STATE0B;
754  MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755  MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760  MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761  MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762 
763  // Transform 3: Rounds 29-32
764  TMP = vld1q_u32(&K[28]);
765  TMP0A = vaddq_u32(MSG3A, TMP);
766  TMP0B = vaddq_u32(MSG3B, TMP);
767  TMP2A = STATE0A;
768  TMP2B = STATE0B;
769  MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770  MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775  MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776  MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777 
778  // Transform 3: Rounds 33-36
779  TMP = vld1q_u32(&K[32]);
780  TMP0A = vaddq_u32(MSG0A, TMP);
781  TMP0B = vaddq_u32(MSG0B, TMP);
782  TMP2A = STATE0A;
783  TMP2B = STATE0B;
784  MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785  MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790  MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791  MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792 
793  // Transform 3: Rounds 37-40
794  TMP = vld1q_u32(&K[36]);
795  TMP0A = vaddq_u32(MSG1A, TMP);
796  TMP0B = vaddq_u32(MSG1B, TMP);
797  TMP2A = STATE0A;
798  TMP2B = STATE0B;
799  MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800  MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805  MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806  MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807 
808  // Transform 3: Rounds 41-44
809  TMP = vld1q_u32(&K[40]);
810  TMP0A = vaddq_u32(MSG2A, TMP);
811  TMP0B = vaddq_u32(MSG2B, TMP);
812  TMP2A = STATE0A;
813  TMP2B = STATE0B;
814  MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815  MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820  MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821  MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822 
823  // Transform 3: Rounds 45-48
824  TMP = vld1q_u32(&K[44]);
825  TMP0A = vaddq_u32(MSG3A, TMP);
826  TMP0B = vaddq_u32(MSG3B, TMP);
827  TMP2A = STATE0A;
828  TMP2B = STATE0B;
829  MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830  MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835  MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836  MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837 
838  // Transform 3: Rounds 49-52
839  TMP = vld1q_u32(&K[48]);
840  TMP0A = vaddq_u32(MSG0A, TMP);
841  TMP0B = vaddq_u32(MSG0B, TMP);
842  TMP2A = STATE0A;
843  TMP2B = STATE0B;
844  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848 
849  // Transform 3: Rounds 53-56
850  TMP = vld1q_u32(&K[52]);
851  TMP0A = vaddq_u32(MSG1A, TMP);
852  TMP0B = vaddq_u32(MSG1B, TMP);
853  TMP2A = STATE0A;
854  TMP2B = STATE0B;
855  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859 
860  // Transform 3: Rounds 57-60
861  TMP = vld1q_u32(&K[56]);
862  TMP0A = vaddq_u32(MSG2A, TMP);
863  TMP0B = vaddq_u32(MSG2B, TMP);
864  TMP2A = STATE0A;
865  TMP2B = STATE0B;
866  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870 
871  // Transform 3: Rounds 61-64
872  TMP = vld1q_u32(&K[60]);
873  TMP0A = vaddq_u32(MSG3A, TMP);
874  TMP0B = vaddq_u32(MSG3B, TMP);
875  TMP2A = STATE0A;
876  TMP2B = STATE0B;
877  STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878  STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879  STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880  STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881 
882  // Transform 3: Update state
883  TMP = vld1q_u32(&INIT[0]);
884  STATE0A = vaddq_u32(STATE0A, TMP);
885  STATE0B = vaddq_u32(STATE0B, TMP);
886  TMP = vld1q_u32(&INIT[4]);
887  STATE1A = vaddq_u32(STATE1A, TMP);
888  STATE1B = vaddq_u32(STATE1B, TMP);
889 
890  // Store result
891  vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892  vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893  vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894  vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895 }
896 }
#endif // ENABLE_ARM_SHANI