// Copyright (c) 2017-2022 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
//
// This is a translation to GCC extended asm syntax from YASM code by Intel
// (available at the bottom of this file).

#include <cstdlib>
#include <stdint.h>

#if defined(__x86_64__) || defined(__amd64__)

namespace sha256_sse4
{
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
#if defined(__clang__)
    /*
    clang is unable to compile this with -O0 and -fsanitize=address.
    See upstream bug: https://github.com/llvm/llvm-project/issues/92182.
    This also fails to compile with -O2, -fcf-protection & -fsanitize=address.
    See https://github.com/bitcoin/bitcoin/issues/31913.
    */
#if __has_feature(address_sanitizer)
    __attribute__((no_sanitize("address")))
#endif
#endif
{
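    // Descriptive note (added for clarity): K256 below holds the 64 SHA-256
    // round constants from FIPS 180-4, and the three masks after it are
    // pshufb control words used to byte-swap the big-endian input and to
    // repack the message-schedule words; all are 16-byte aligned for movdqa.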
    static const uint32_t K256 alignas(16) [] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
    static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
    static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
    uint64_t tbl;
    uint64_t inp_end, inp;
    uint32_t xfer alignas(16) [4];

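    // Operand map for the inline asm below, as given by the constraint list at
    // the end of the block: %0 = s, %1 = chunk (later reused as a loop
    // counter), %2 = blocks (converted to an end pointer, with its low half
    // %k2 holding the working variable e), %3-%9 = a,b,c,d,f,g,h,
    // %10-%12 = y0,y1,y2 (scratch for the S1/S0/Ch/Maj terms), %13 = tbl
    // (round-constant pointer), %14 = inp_end, %15 = inp, %16 = xfer,
    // %17-%20 = K256, FLIP_MASK, SHUF_00BA, SHUF_DC00.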
    __asm__ __volatile__(
        "shl $0x6,%2;"
        "je Ldone_hash_%=;"
        "add %1,%2;"
        "mov %2,%14;"
        "mov (%0),%3;"
        "mov 0x4(%0),%4;"
        "mov 0x8(%0),%5;"
        "mov 0xc(%0),%6;"
        "mov 0x10(%0),%k2;"
        "mov 0x14(%0),%7;"
        "mov 0x18(%0),%8;"
        "mov 0x1c(%0),%9;"
        "movdqa %18,%%xmm12;"
        "movdqa %19,%%xmm10;"
        "movdqa %20,%%xmm11;"

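        // Lloop0: outer loop, one iteration per 64-byte block. It loads the
        // 16 message words and byte-swaps them into xmm4-xmm7 via pshufb.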
71  "Lloop0_%=:"
72  "lea %17,%13;"
73  "movdqu (%1),%%xmm4;"
74  "pshufb %%xmm12,%%xmm4;"
75  "movdqu 0x10(%1),%%xmm5;"
76  "pshufb %%xmm12,%%xmm5;"
77  "movdqu 0x20(%1),%%xmm6;"
78  "pshufb %%xmm12,%%xmm6;"
79  "movdqu 0x30(%1),%%xmm7;"
80  "pshufb %%xmm12,%%xmm7;"
81  "mov %1,%15;"
82  "mov $3,%1;"
83 
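        // Lloop1: runs three times; each pass performs 16 rounds while
        // computing the next 16 message-schedule words (rounds 0-47).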
84  "Lloop1_%=:"
85  "movdqa 0x0(%13),%%xmm9;"
86  "paddd %%xmm4,%%xmm9;"
87  "movdqa %%xmm9,%16;"
88  "movdqa %%xmm7,%%xmm0;"
89  "mov %k2,%10;"
90  "ror $0xe,%10;"
91  "mov %3,%11;"
92  "palignr $0x4,%%xmm6,%%xmm0;"
93  "ror $0x9,%11;"
94  "xor %k2,%10;"
95  "mov %7,%12;"
96  "ror $0x5,%10;"
97  "movdqa %%xmm5,%%xmm1;"
98  "xor %3,%11;"
99  "xor %8,%12;"
100  "paddd %%xmm4,%%xmm0;"
101  "xor %k2,%10;"
102  "and %k2,%12;"
103  "ror $0xb,%11;"
104  "palignr $0x4,%%xmm4,%%xmm1;"
105  "xor %3,%11;"
106  "ror $0x6,%10;"
107  "xor %8,%12;"
108  "movdqa %%xmm1,%%xmm2;"
109  "ror $0x2,%11;"
110  "add %10,%12;"
111  "add %16,%12;"
112  "movdqa %%xmm1,%%xmm3;"
113  "mov %3,%10;"
114  "add %12,%9;"
115  "mov %3,%12;"
116  "pslld $0x19,%%xmm1;"
117  "or %5,%10;"
118  "add %9,%6;"
119  "and %5,%12;"
120  "psrld $0x7,%%xmm2;"
121  "and %4,%10;"
122  "add %11,%9;"
123  "por %%xmm2,%%xmm1;"
124  "or %12,%10;"
125  "add %10,%9;"
126  "movdqa %%xmm3,%%xmm2;"
127  "mov %6,%10;"
128  "mov %9,%11;"
129  "movdqa %%xmm3,%%xmm8;"
130  "ror $0xe,%10;"
131  "xor %6,%10;"
132  "mov %k2,%12;"
133  "ror $0x9,%11;"
134  "pslld $0xe,%%xmm3;"
135  "xor %9,%11;"
136  "ror $0x5,%10;"
137  "xor %7,%12;"
138  "psrld $0x12,%%xmm2;"
139  "ror $0xb,%11;"
140  "xor %6,%10;"
141  "and %6,%12;"
142  "ror $0x6,%10;"
143  "pxor %%xmm3,%%xmm1;"
144  "xor %9,%11;"
145  "xor %7,%12;"
146  "psrld $0x3,%%xmm8;"
147  "add %10,%12;"
148  "add 4+%16,%12;"
149  "ror $0x2,%11;"
150  "pxor %%xmm2,%%xmm1;"
151  "mov %9,%10;"
152  "add %12,%8;"
153  "mov %9,%12;"
154  "pxor %%xmm8,%%xmm1;"
155  "or %4,%10;"
156  "add %8,%5;"
157  "and %4,%12;"
158  "pshufd $0xfa,%%xmm7,%%xmm2;"
159  "and %3,%10;"
160  "add %11,%8;"
161  "paddd %%xmm1,%%xmm0;"
162  "or %12,%10;"
163  "add %10,%8;"
164  "movdqa %%xmm2,%%xmm3;"
165  "mov %5,%10;"
166  "mov %8,%11;"
167  "ror $0xe,%10;"
168  "movdqa %%xmm2,%%xmm8;"
169  "xor %5,%10;"
170  "ror $0x9,%11;"
171  "mov %6,%12;"
172  "xor %8,%11;"
173  "ror $0x5,%10;"
174  "psrlq $0x11,%%xmm2;"
175  "xor %k2,%12;"
176  "psrlq $0x13,%%xmm3;"
177  "xor %5,%10;"
178  "and %5,%12;"
179  "psrld $0xa,%%xmm8;"
180  "ror $0xb,%11;"
181  "xor %8,%11;"
182  "xor %k2,%12;"
183  "ror $0x6,%10;"
184  "pxor %%xmm3,%%xmm2;"
185  "add %10,%12;"
186  "ror $0x2,%11;"
187  "add 8+%16,%12;"
188  "pxor %%xmm2,%%xmm8;"
189  "mov %8,%10;"
190  "add %12,%7;"
191  "mov %8,%12;"
192  "pshufb %%xmm10,%%xmm8;"
193  "or %3,%10;"
194  "add %7,%4;"
195  "and %3,%12;"
196  "paddd %%xmm8,%%xmm0;"
197  "and %9,%10;"
198  "add %11,%7;"
199  "pshufd $0x50,%%xmm0,%%xmm2;"
200  "or %12,%10;"
201  "add %10,%7;"
202  "movdqa %%xmm2,%%xmm3;"
203  "mov %4,%10;"
204  "ror $0xe,%10;"
205  "mov %7,%11;"
206  "movdqa %%xmm2,%%xmm4;"
207  "ror $0x9,%11;"
208  "xor %4,%10;"
209  "mov %5,%12;"
210  "ror $0x5,%10;"
211  "psrlq $0x11,%%xmm2;"
212  "xor %7,%11;"
213  "xor %6,%12;"
214  "psrlq $0x13,%%xmm3;"
215  "xor %4,%10;"
216  "and %4,%12;"
217  "ror $0xb,%11;"
218  "psrld $0xa,%%xmm4;"
219  "xor %7,%11;"
220  "ror $0x6,%10;"
221  "xor %6,%12;"
222  "pxor %%xmm3,%%xmm2;"
223  "ror $0x2,%11;"
224  "add %10,%12;"
225  "add 12+%16,%12;"
226  "pxor %%xmm2,%%xmm4;"
227  "mov %7,%10;"
228  "add %12,%k2;"
229  "mov %7,%12;"
230  "pshufb %%xmm11,%%xmm4;"
231  "or %9,%10;"
232  "add %k2,%3;"
233  "and %9,%12;"
234  "paddd %%xmm0,%%xmm4;"
235  "and %8,%10;"
236  "add %11,%k2;"
237  "or %12,%10;"
238  "add %10,%k2;"
239  "movdqa 0x10(%13),%%xmm9;"
240  "paddd %%xmm5,%%xmm9;"
241  "movdqa %%xmm9,%16;"
242  "movdqa %%xmm4,%%xmm0;"
243  "mov %3,%10;"
244  "ror $0xe,%10;"
245  "mov %k2,%11;"
246  "palignr $0x4,%%xmm7,%%xmm0;"
247  "ror $0x9,%11;"
248  "xor %3,%10;"
249  "mov %4,%12;"
250  "ror $0x5,%10;"
251  "movdqa %%xmm6,%%xmm1;"
252  "xor %k2,%11;"
253  "xor %5,%12;"
254  "paddd %%xmm5,%%xmm0;"
255  "xor %3,%10;"
256  "and %3,%12;"
257  "ror $0xb,%11;"
258  "palignr $0x4,%%xmm5,%%xmm1;"
259  "xor %k2,%11;"
260  "ror $0x6,%10;"
261  "xor %5,%12;"
262  "movdqa %%xmm1,%%xmm2;"
263  "ror $0x2,%11;"
264  "add %10,%12;"
265  "add %16,%12;"
266  "movdqa %%xmm1,%%xmm3;"
267  "mov %k2,%10;"
268  "add %12,%6;"
269  "mov %k2,%12;"
270  "pslld $0x19,%%xmm1;"
271  "or %8,%10;"
272  "add %6,%9;"
273  "and %8,%12;"
274  "psrld $0x7,%%xmm2;"
275  "and %7,%10;"
276  "add %11,%6;"
277  "por %%xmm2,%%xmm1;"
278  "or %12,%10;"
279  "add %10,%6;"
280  "movdqa %%xmm3,%%xmm2;"
281  "mov %9,%10;"
282  "mov %6,%11;"
283  "movdqa %%xmm3,%%xmm8;"
284  "ror $0xe,%10;"
285  "xor %9,%10;"
286  "mov %3,%12;"
287  "ror $0x9,%11;"
288  "pslld $0xe,%%xmm3;"
289  "xor %6,%11;"
290  "ror $0x5,%10;"
291  "xor %4,%12;"
292  "psrld $0x12,%%xmm2;"
293  "ror $0xb,%11;"
294  "xor %9,%10;"
295  "and %9,%12;"
296  "ror $0x6,%10;"
297  "pxor %%xmm3,%%xmm1;"
298  "xor %6,%11;"
299  "xor %4,%12;"
300  "psrld $0x3,%%xmm8;"
301  "add %10,%12;"
302  "add 4+%16,%12;"
303  "ror $0x2,%11;"
304  "pxor %%xmm2,%%xmm1;"
305  "mov %6,%10;"
306  "add %12,%5;"
307  "mov %6,%12;"
308  "pxor %%xmm8,%%xmm1;"
309  "or %7,%10;"
310  "add %5,%8;"
311  "and %7,%12;"
312  "pshufd $0xfa,%%xmm4,%%xmm2;"
313  "and %k2,%10;"
314  "add %11,%5;"
315  "paddd %%xmm1,%%xmm0;"
316  "or %12,%10;"
317  "add %10,%5;"
318  "movdqa %%xmm2,%%xmm3;"
319  "mov %8,%10;"
320  "mov %5,%11;"
321  "ror $0xe,%10;"
322  "movdqa %%xmm2,%%xmm8;"
323  "xor %8,%10;"
324  "ror $0x9,%11;"
325  "mov %9,%12;"
326  "xor %5,%11;"
327  "ror $0x5,%10;"
328  "psrlq $0x11,%%xmm2;"
329  "xor %3,%12;"
330  "psrlq $0x13,%%xmm3;"
331  "xor %8,%10;"
332  "and %8,%12;"
333  "psrld $0xa,%%xmm8;"
334  "ror $0xb,%11;"
335  "xor %5,%11;"
336  "xor %3,%12;"
337  "ror $0x6,%10;"
338  "pxor %%xmm3,%%xmm2;"
339  "add %10,%12;"
340  "ror $0x2,%11;"
341  "add 8+%16,%12;"
342  "pxor %%xmm2,%%xmm8;"
343  "mov %5,%10;"
344  "add %12,%4;"
345  "mov %5,%12;"
346  "pshufb %%xmm10,%%xmm8;"
347  "or %k2,%10;"
348  "add %4,%7;"
349  "and %k2,%12;"
350  "paddd %%xmm8,%%xmm0;"
351  "and %6,%10;"
352  "add %11,%4;"
353  "pshufd $0x50,%%xmm0,%%xmm2;"
354  "or %12,%10;"
355  "add %10,%4;"
356  "movdqa %%xmm2,%%xmm3;"
357  "mov %7,%10;"
358  "ror $0xe,%10;"
359  "mov %4,%11;"
360  "movdqa %%xmm2,%%xmm5;"
361  "ror $0x9,%11;"
362  "xor %7,%10;"
363  "mov %8,%12;"
364  "ror $0x5,%10;"
365  "psrlq $0x11,%%xmm2;"
366  "xor %4,%11;"
367  "xor %9,%12;"
368  "psrlq $0x13,%%xmm3;"
369  "xor %7,%10;"
370  "and %7,%12;"
371  "ror $0xb,%11;"
372  "psrld $0xa,%%xmm5;"
373  "xor %4,%11;"
374  "ror $0x6,%10;"
375  "xor %9,%12;"
376  "pxor %%xmm3,%%xmm2;"
377  "ror $0x2,%11;"
378  "add %10,%12;"
379  "add 12+%16,%12;"
380  "pxor %%xmm2,%%xmm5;"
381  "mov %4,%10;"
382  "add %12,%3;"
383  "mov %4,%12;"
384  "pshufb %%xmm11,%%xmm5;"
385  "or %6,%10;"
386  "add %3,%k2;"
387  "and %6,%12;"
388  "paddd %%xmm0,%%xmm5;"
389  "and %5,%10;"
390  "add %11,%3;"
391  "or %12,%10;"
392  "add %10,%3;"
393  "movdqa 0x20(%13),%%xmm9;"
394  "paddd %%xmm6,%%xmm9;"
395  "movdqa %%xmm9,%16;"
396  "movdqa %%xmm5,%%xmm0;"
397  "mov %k2,%10;"
398  "ror $0xe,%10;"
399  "mov %3,%11;"
400  "palignr $0x4,%%xmm4,%%xmm0;"
401  "ror $0x9,%11;"
402  "xor %k2,%10;"
403  "mov %7,%12;"
404  "ror $0x5,%10;"
405  "movdqa %%xmm7,%%xmm1;"
406  "xor %3,%11;"
407  "xor %8,%12;"
408  "paddd %%xmm6,%%xmm0;"
409  "xor %k2,%10;"
410  "and %k2,%12;"
411  "ror $0xb,%11;"
412  "palignr $0x4,%%xmm6,%%xmm1;"
413  "xor %3,%11;"
414  "ror $0x6,%10;"
415  "xor %8,%12;"
416  "movdqa %%xmm1,%%xmm2;"
417  "ror $0x2,%11;"
418  "add %10,%12;"
419  "add %16,%12;"
420  "movdqa %%xmm1,%%xmm3;"
421  "mov %3,%10;"
422  "add %12,%9;"
423  "mov %3,%12;"
424  "pslld $0x19,%%xmm1;"
425  "or %5,%10;"
426  "add %9,%6;"
427  "and %5,%12;"
428  "psrld $0x7,%%xmm2;"
429  "and %4,%10;"
430  "add %11,%9;"
431  "por %%xmm2,%%xmm1;"
432  "or %12,%10;"
433  "add %10,%9;"
434  "movdqa %%xmm3,%%xmm2;"
435  "mov %6,%10;"
436  "mov %9,%11;"
437  "movdqa %%xmm3,%%xmm8;"
438  "ror $0xe,%10;"
439  "xor %6,%10;"
440  "mov %k2,%12;"
441  "ror $0x9,%11;"
442  "pslld $0xe,%%xmm3;"
443  "xor %9,%11;"
444  "ror $0x5,%10;"
445  "xor %7,%12;"
446  "psrld $0x12,%%xmm2;"
447  "ror $0xb,%11;"
448  "xor %6,%10;"
449  "and %6,%12;"
450  "ror $0x6,%10;"
451  "pxor %%xmm3,%%xmm1;"
452  "xor %9,%11;"
453  "xor %7,%12;"
454  "psrld $0x3,%%xmm8;"
455  "add %10,%12;"
456  "add 4+%16,%12;"
457  "ror $0x2,%11;"
458  "pxor %%xmm2,%%xmm1;"
459  "mov %9,%10;"
460  "add %12,%8;"
461  "mov %9,%12;"
462  "pxor %%xmm8,%%xmm1;"
463  "or %4,%10;"
464  "add %8,%5;"
465  "and %4,%12;"
466  "pshufd $0xfa,%%xmm5,%%xmm2;"
467  "and %3,%10;"
468  "add %11,%8;"
469  "paddd %%xmm1,%%xmm0;"
470  "or %12,%10;"
471  "add %10,%8;"
472  "movdqa %%xmm2,%%xmm3;"
473  "mov %5,%10;"
474  "mov %8,%11;"
475  "ror $0xe,%10;"
476  "movdqa %%xmm2,%%xmm8;"
477  "xor %5,%10;"
478  "ror $0x9,%11;"
479  "mov %6,%12;"
480  "xor %8,%11;"
481  "ror $0x5,%10;"
482  "psrlq $0x11,%%xmm2;"
483  "xor %k2,%12;"
484  "psrlq $0x13,%%xmm3;"
485  "xor %5,%10;"
486  "and %5,%12;"
487  "psrld $0xa,%%xmm8;"
488  "ror $0xb,%11;"
489  "xor %8,%11;"
490  "xor %k2,%12;"
491  "ror $0x6,%10;"
492  "pxor %%xmm3,%%xmm2;"
493  "add %10,%12;"
494  "ror $0x2,%11;"
495  "add 8+%16,%12;"
496  "pxor %%xmm2,%%xmm8;"
497  "mov %8,%10;"
498  "add %12,%7;"
499  "mov %8,%12;"
500  "pshufb %%xmm10,%%xmm8;"
501  "or %3,%10;"
502  "add %7,%4;"
503  "and %3,%12;"
504  "paddd %%xmm8,%%xmm0;"
505  "and %9,%10;"
506  "add %11,%7;"
507  "pshufd $0x50,%%xmm0,%%xmm2;"
508  "or %12,%10;"
509  "add %10,%7;"
510  "movdqa %%xmm2,%%xmm3;"
511  "mov %4,%10;"
512  "ror $0xe,%10;"
513  "mov %7,%11;"
514  "movdqa %%xmm2,%%xmm6;"
515  "ror $0x9,%11;"
516  "xor %4,%10;"
517  "mov %5,%12;"
518  "ror $0x5,%10;"
519  "psrlq $0x11,%%xmm2;"
520  "xor %7,%11;"
521  "xor %6,%12;"
522  "psrlq $0x13,%%xmm3;"
523  "xor %4,%10;"
524  "and %4,%12;"
525  "ror $0xb,%11;"
526  "psrld $0xa,%%xmm6;"
527  "xor %7,%11;"
528  "ror $0x6,%10;"
529  "xor %6,%12;"
530  "pxor %%xmm3,%%xmm2;"
531  "ror $0x2,%11;"
532  "add %10,%12;"
533  "add 12+%16,%12;"
534  "pxor %%xmm2,%%xmm6;"
535  "mov %7,%10;"
536  "add %12,%k2;"
537  "mov %7,%12;"
538  "pshufb %%xmm11,%%xmm6;"
539  "or %9,%10;"
540  "add %k2,%3;"
541  "and %9,%12;"
542  "paddd %%xmm0,%%xmm6;"
543  "and %8,%10;"
544  "add %11,%k2;"
545  "or %12,%10;"
546  "add %10,%k2;"
547  "movdqa 0x30(%13),%%xmm9;"
548  "paddd %%xmm7,%%xmm9;"
549  "movdqa %%xmm9,%16;"
550  "add $0x40,%13;"
551  "movdqa %%xmm6,%%xmm0;"
552  "mov %3,%10;"
553  "ror $0xe,%10;"
554  "mov %k2,%11;"
555  "palignr $0x4,%%xmm5,%%xmm0;"
556  "ror $0x9,%11;"
557  "xor %3,%10;"
558  "mov %4,%12;"
559  "ror $0x5,%10;"
560  "movdqa %%xmm4,%%xmm1;"
561  "xor %k2,%11;"
562  "xor %5,%12;"
563  "paddd %%xmm7,%%xmm0;"
564  "xor %3,%10;"
565  "and %3,%12;"
566  "ror $0xb,%11;"
567  "palignr $0x4,%%xmm7,%%xmm1;"
568  "xor %k2,%11;"
569  "ror $0x6,%10;"
570  "xor %5,%12;"
571  "movdqa %%xmm1,%%xmm2;"
572  "ror $0x2,%11;"
573  "add %10,%12;"
574  "add %16,%12;"
575  "movdqa %%xmm1,%%xmm3;"
576  "mov %k2,%10;"
577  "add %12,%6;"
578  "mov %k2,%12;"
579  "pslld $0x19,%%xmm1;"
580  "or %8,%10;"
581  "add %6,%9;"
582  "and %8,%12;"
583  "psrld $0x7,%%xmm2;"
584  "and %7,%10;"
585  "add %11,%6;"
586  "por %%xmm2,%%xmm1;"
587  "or %12,%10;"
588  "add %10,%6;"
589  "movdqa %%xmm3,%%xmm2;"
590  "mov %9,%10;"
591  "mov %6,%11;"
592  "movdqa %%xmm3,%%xmm8;"
593  "ror $0xe,%10;"
594  "xor %9,%10;"
595  "mov %3,%12;"
596  "ror $0x9,%11;"
597  "pslld $0xe,%%xmm3;"
598  "xor %6,%11;"
599  "ror $0x5,%10;"
600  "xor %4,%12;"
601  "psrld $0x12,%%xmm2;"
602  "ror $0xb,%11;"
603  "xor %9,%10;"
604  "and %9,%12;"
605  "ror $0x6,%10;"
606  "pxor %%xmm3,%%xmm1;"
607  "xor %6,%11;"
608  "xor %4,%12;"
609  "psrld $0x3,%%xmm8;"
610  "add %10,%12;"
611  "add 4+%16,%12;"
612  "ror $0x2,%11;"
613  "pxor %%xmm2,%%xmm1;"
614  "mov %6,%10;"
615  "add %12,%5;"
616  "mov %6,%12;"
617  "pxor %%xmm8,%%xmm1;"
618  "or %7,%10;"
619  "add %5,%8;"
620  "and %7,%12;"
621  "pshufd $0xfa,%%xmm6,%%xmm2;"
622  "and %k2,%10;"
623  "add %11,%5;"
624  "paddd %%xmm1,%%xmm0;"
625  "or %12,%10;"
626  "add %10,%5;"
627  "movdqa %%xmm2,%%xmm3;"
628  "mov %8,%10;"
629  "mov %5,%11;"
630  "ror $0xe,%10;"
631  "movdqa %%xmm2,%%xmm8;"
632  "xor %8,%10;"
633  "ror $0x9,%11;"
634  "mov %9,%12;"
635  "xor %5,%11;"
636  "ror $0x5,%10;"
637  "psrlq $0x11,%%xmm2;"
638  "xor %3,%12;"
639  "psrlq $0x13,%%xmm3;"
640  "xor %8,%10;"
641  "and %8,%12;"
642  "psrld $0xa,%%xmm8;"
643  "ror $0xb,%11;"
644  "xor %5,%11;"
645  "xor %3,%12;"
646  "ror $0x6,%10;"
647  "pxor %%xmm3,%%xmm2;"
648  "add %10,%12;"
649  "ror $0x2,%11;"
650  "add 8+%16,%12;"
651  "pxor %%xmm2,%%xmm8;"
652  "mov %5,%10;"
653  "add %12,%4;"
654  "mov %5,%12;"
655  "pshufb %%xmm10,%%xmm8;"
656  "or %k2,%10;"
657  "add %4,%7;"
658  "and %k2,%12;"
659  "paddd %%xmm8,%%xmm0;"
660  "and %6,%10;"
661  "add %11,%4;"
662  "pshufd $0x50,%%xmm0,%%xmm2;"
663  "or %12,%10;"
664  "add %10,%4;"
665  "movdqa %%xmm2,%%xmm3;"
666  "mov %7,%10;"
667  "ror $0xe,%10;"
668  "mov %4,%11;"
669  "movdqa %%xmm2,%%xmm7;"
670  "ror $0x9,%11;"
671  "xor %7,%10;"
672  "mov %8,%12;"
673  "ror $0x5,%10;"
674  "psrlq $0x11,%%xmm2;"
675  "xor %4,%11;"
676  "xor %9,%12;"
677  "psrlq $0x13,%%xmm3;"
678  "xor %7,%10;"
679  "and %7,%12;"
680  "ror $0xb,%11;"
681  "psrld $0xa,%%xmm7;"
682  "xor %4,%11;"
683  "ror $0x6,%10;"
684  "xor %9,%12;"
685  "pxor %%xmm3,%%xmm2;"
686  "ror $0x2,%11;"
687  "add %10,%12;"
688  "add 12+%16,%12;"
689  "pxor %%xmm2,%%xmm7;"
690  "mov %4,%10;"
691  "add %12,%3;"
692  "mov %4,%12;"
693  "pshufb %%xmm11,%%xmm7;"
694  "or %6,%10;"
695  "add %3,%k2;"
696  "and %6,%12;"
697  "paddd %%xmm0,%%xmm7;"
698  "and %5,%10;"
699  "add %11,%3;"
700  "or %12,%10;"
701  "add %10,%3;"
702  "sub $0x1,%1;"
703  "jne Lloop1_%=;"
704  "mov $0x2,%1;"
705 
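        // Lloop2: runs twice; each pass performs 8 of the final 16 rounds
        // (48-63), which need no further message scheduling.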
706  "Lloop2_%=:"
707  "paddd 0x0(%13),%%xmm4;"
708  "movdqa %%xmm4,%16;"
709  "mov %k2,%10;"
710  "ror $0xe,%10;"
711  "mov %3,%11;"
712  "xor %k2,%10;"
713  "ror $0x9,%11;"
714  "mov %7,%12;"
715  "xor %3,%11;"
716  "ror $0x5,%10;"
717  "xor %8,%12;"
718  "xor %k2,%10;"
719  "ror $0xb,%11;"
720  "and %k2,%12;"
721  "xor %3,%11;"
722  "ror $0x6,%10;"
723  "xor %8,%12;"
724  "add %10,%12;"
725  "ror $0x2,%11;"
726  "add %16,%12;"
727  "mov %3,%10;"
728  "add %12,%9;"
729  "mov %3,%12;"
730  "or %5,%10;"
731  "add %9,%6;"
732  "and %5,%12;"
733  "and %4,%10;"
734  "add %11,%9;"
735  "or %12,%10;"
736  "add %10,%9;"
737  "mov %6,%10;"
738  "ror $0xe,%10;"
739  "mov %9,%11;"
740  "xor %6,%10;"
741  "ror $0x9,%11;"
742  "mov %k2,%12;"
743  "xor %9,%11;"
744  "ror $0x5,%10;"
745  "xor %7,%12;"
746  "xor %6,%10;"
747  "ror $0xb,%11;"
748  "and %6,%12;"
749  "xor %9,%11;"
750  "ror $0x6,%10;"
751  "xor %7,%12;"
752  "add %10,%12;"
753  "ror $0x2,%11;"
754  "add 4+%16,%12;"
755  "mov %9,%10;"
756  "add %12,%8;"
757  "mov %9,%12;"
758  "or %4,%10;"
759  "add %8,%5;"
760  "and %4,%12;"
761  "and %3,%10;"
762  "add %11,%8;"
763  "or %12,%10;"
764  "add %10,%8;"
765  "mov %5,%10;"
766  "ror $0xe,%10;"
767  "mov %8,%11;"
768  "xor %5,%10;"
769  "ror $0x9,%11;"
770  "mov %6,%12;"
771  "xor %8,%11;"
772  "ror $0x5,%10;"
773  "xor %k2,%12;"
774  "xor %5,%10;"
775  "ror $0xb,%11;"
776  "and %5,%12;"
777  "xor %8,%11;"
778  "ror $0x6,%10;"
779  "xor %k2,%12;"
780  "add %10,%12;"
781  "ror $0x2,%11;"
782  "add 8+%16,%12;"
783  "mov %8,%10;"
784  "add %12,%7;"
785  "mov %8,%12;"
786  "or %3,%10;"
787  "add %7,%4;"
788  "and %3,%12;"
789  "and %9,%10;"
790  "add %11,%7;"
791  "or %12,%10;"
792  "add %10,%7;"
793  "mov %4,%10;"
794  "ror $0xe,%10;"
795  "mov %7,%11;"
796  "xor %4,%10;"
797  "ror $0x9,%11;"
798  "mov %5,%12;"
799  "xor %7,%11;"
800  "ror $0x5,%10;"
801  "xor %6,%12;"
802  "xor %4,%10;"
803  "ror $0xb,%11;"
804  "and %4,%12;"
805  "xor %7,%11;"
806  "ror $0x6,%10;"
807  "xor %6,%12;"
808  "add %10,%12;"
809  "ror $0x2,%11;"
810  "add 12+%16,%12;"
811  "mov %7,%10;"
812  "add %12,%k2;"
813  "mov %7,%12;"
814  "or %9,%10;"
815  "add %k2,%3;"
816  "and %9,%12;"
817  "and %8,%10;"
818  "add %11,%k2;"
819  "or %12,%10;"
820  "add %10,%k2;"
821  "paddd 0x10(%13),%%xmm5;"
822  "movdqa %%xmm5,%16;"
823  "add $0x20,%13;"
824  "mov %3,%10;"
825  "ror $0xe,%10;"
826  "mov %k2,%11;"
827  "xor %3,%10;"
828  "ror $0x9,%11;"
829  "mov %4,%12;"
830  "xor %k2,%11;"
831  "ror $0x5,%10;"
832  "xor %5,%12;"
833  "xor %3,%10;"
834  "ror $0xb,%11;"
835  "and %3,%12;"
836  "xor %k2,%11;"
837  "ror $0x6,%10;"
838  "xor %5,%12;"
839  "add %10,%12;"
840  "ror $0x2,%11;"
841  "add %16,%12;"
842  "mov %k2,%10;"
843  "add %12,%6;"
844  "mov %k2,%12;"
845  "or %8,%10;"
846  "add %6,%9;"
847  "and %8,%12;"
848  "and %7,%10;"
849  "add %11,%6;"
850  "or %12,%10;"
851  "add %10,%6;"
852  "mov %9,%10;"
853  "ror $0xe,%10;"
854  "mov %6,%11;"
855  "xor %9,%10;"
856  "ror $0x9,%11;"
857  "mov %3,%12;"
858  "xor %6,%11;"
859  "ror $0x5,%10;"
860  "xor %4,%12;"
861  "xor %9,%10;"
862  "ror $0xb,%11;"
863  "and %9,%12;"
864  "xor %6,%11;"
865  "ror $0x6,%10;"
866  "xor %4,%12;"
867  "add %10,%12;"
868  "ror $0x2,%11;"
869  "add 4+%16,%12;"
870  "mov %6,%10;"
871  "add %12,%5;"
872  "mov %6,%12;"
873  "or %7,%10;"
874  "add %5,%8;"
875  "and %7,%12;"
876  "and %k2,%10;"
877  "add %11,%5;"
878  "or %12,%10;"
879  "add %10,%5;"
880  "mov %8,%10;"
881  "ror $0xe,%10;"
882  "mov %5,%11;"
883  "xor %8,%10;"
884  "ror $0x9,%11;"
885  "mov %9,%12;"
886  "xor %5,%11;"
887  "ror $0x5,%10;"
888  "xor %3,%12;"
889  "xor %8,%10;"
890  "ror $0xb,%11;"
891  "and %8,%12;"
892  "xor %5,%11;"
893  "ror $0x6,%10;"
894  "xor %3,%12;"
895  "add %10,%12;"
896  "ror $0x2,%11;"
897  "add 8+%16,%12;"
898  "mov %5,%10;"
899  "add %12,%4;"
900  "mov %5,%12;"
901  "or %k2,%10;"
902  "add %4,%7;"
903  "and %k2,%12;"
904  "and %6,%10;"
905  "add %11,%4;"
906  "or %12,%10;"
907  "add %10,%4;"
908  "mov %7,%10;"
909  "ror $0xe,%10;"
910  "mov %4,%11;"
911  "xor %7,%10;"
912  "ror $0x9,%11;"
913  "mov %8,%12;"
914  "xor %4,%11;"
915  "ror $0x5,%10;"
916  "xor %9,%12;"
917  "xor %7,%10;"
918  "ror $0xb,%11;"
919  "and %7,%12;"
920  "xor %4,%11;"
921  "ror $0x6,%10;"
922  "xor %9,%12;"
923  "add %10,%12;"
924  "ror $0x2,%11;"
925  "add 12+%16,%12;"
926  "mov %4,%10;"
927  "add %12,%3;"
928  "mov %4,%12;"
929  "or %6,%10;"
930  "add %3,%k2;"
931  "and %6,%12;"
932  "and %5,%10;"
933  "add %11,%3;"
934  "or %12,%10;"
935  "add %10,%3;"
936  "movdqa %%xmm6,%%xmm4;"
937  "movdqa %%xmm7,%%xmm5;"
938  "sub $0x1,%1;"
939  "jne Lloop2_%=;"
940  "add (%0),%3;"
941  "mov %3,(%0);"
942  "add 0x4(%0),%4;"
943  "mov %4,0x4(%0);"
944  "add 0x8(%0),%5;"
945  "mov %5,0x8(%0);"
946  "add 0xc(%0),%6;"
947  "mov %6,0xc(%0);"
948  "add 0x10(%0),%k2;"
949  "mov %k2,0x10(%0);"
950  "add 0x14(%0),%7;"
951  "mov %7,0x14(%0);"
952  "add 0x18(%0),%8;"
953  "mov %8,0x18(%0);"
954  "add 0x1c(%0),%9;"
955  "mov %9,0x1c(%0);"
956  "mov %15,%1;"
957  "add $0x40,%1;"
958  "cmp %14,%1;"
959  "jne Lloop0_%=;"
960 
961  "Ldone_hash_%=:"
962 
963  : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
964  : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
965  : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
966  );
967 }
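
// Reference sketch (added for illustration; not used by the assembly above):
// a plain C++ version of the single SHA-256 round that each unrolled step of
// the asm computes, written with the Sigma/Ch/Maj definitions of FIPS 180-4.
// The names Rotr32 and RoundSketch are illustrative only.
inline uint32_t Rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
inline void RoundSketch(uint32_t a, uint32_t b, uint32_t c, uint32_t& d,
                        uint32_t e, uint32_t f, uint32_t g, uint32_t& h,
                        uint32_t k, uint32_t w)
{
    const uint32_t s1 = Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25); // y0 in the asm
    const uint32_t ch = (e & f) ^ (~e & g);                           // y2 in the asm
    const uint32_t t1 = h + s1 + ch + k + w;                          // k + w is taken from xfer
    const uint32_t s0 = Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22); // y1 in the asm
    const uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    d += t1;           // d = d + h + S1 + CH + k + w
    h = t1 + s0 + maj; // h = h + S1 + CH + k + w + S0 + MAJ
}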
}
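
// Usage sketch (added for illustration; a hypothetical helper, not part of the
// original file). In Bitcoin Core this Transform is normally invoked through
// the runtime dispatch in crypto/sha256.cpp; the snippet below only shows the
// calling convention: `s` points to the 8-word chained state and `blocks`
// counts 64-byte chunks. SHA-256 message padding is the caller's responsibility.
inline void ExampleCompressOneBlock(uint32_t state[8], const unsigned char block[64])
{
    // Standard SHA-256 initial hash values (FIPS 180-4).
    static const uint32_t init[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    };
    for (int i = 0; i < 8; ++i) state[i] = init[i];
    sha256_sse4::Transform(state, block, 1); // compress a single 64-byte block
}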

/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the
; distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to https://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 blocks at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define MOVDQ movdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
    add %2, %1
    mov %1, %2
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
    MOVDQ %1, %2
    pshufb %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12

%ifdef LINUX
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg

%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d



_INP_END_SIZE equ 8
_INP_SIZE equ 8
_XFER_SIZE equ 8
%ifdef LINUX
_XMM_SAVE_SIZE equ 0
%else
_XMM_SAVE_SIZE equ 7*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE equ 8

_INP_END equ 0
_INP equ _INP_END + _INP_END_SIZE
_XFER equ _INP + _INP_SIZE
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

%macro FOUR_ROUNDS_AND_SCHED 0
    ;; compute s0 four at a time and s1 two at a time
    ;; compute W[-16] + W[-7] 4 at a time
    movdqa XTMP0, X3
    mov y0, e ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a ; y1 = a
    palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
    ror y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e ; y0 = e ^ (e >> (25-11))
    mov y2, f ; y2 = f
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    movdqa XTMP1, X1
    xor y1, a ; y1 = a ^ (a >> (22-13)
    xor y2, g ; y2 = f^g
    paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e ; y2 = (f^g)&e
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    ;; compute s0
    palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0 ; y2 = S1 + CH
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
    movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    pslld XTMP1, (32-7)
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    psrld XTMP2, 7
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
    mov y0, e ; y0 = e
    mov y1, a ; y1 = a
    movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
    ror y0, (25-11) ; y0 = e >> (25-11)
    xor y0, e ; y0 = e ^ (e >> (25-11))
    mov y2, f ; y2 = f
    ror y1, (22-13) ; y1 = a >> (22-13)
    pslld XTMP3, (32-18)
    xor y1, a ; y1 = a ^ (a >> (22-13)
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g ; y2 = f^g
    psrld XTMP2, 18
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e ; y2 = (f^g)&e
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    pxor XTMP1, XTMP3
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
    add y2, y0 ; y2 = S1 + CH
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    pxor XTMP1, XTMP4 ; XTMP1 = s0
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    ;; compute low s1
    pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
    mov y0, e ; y0 = e
    mov y1, a ; y1 = a
    ror y0, (25-11) ; y0 = e >> (25-11)
    movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
    xor y0, e ; y0 = e ^ (e >> (25-11))
    ror y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f ; y2 = f
    xor y1, a ; y1 = a ^ (a >> (22-13)
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
    xor y2, g ; y2 = f^g
    psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e ; y2 = (f^g)&e
    psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    pxor XTMP2, XTMP3
    add y2, y0 ; y2 = S1 + CH
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
    pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    ;; compute high s1
    pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
    movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
    mov y0, e ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a ; y1 = a
    movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
    ror y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e ; y0 = e ^ (e >> (25-11))
    mov y2, f ; y2 = f
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
    xor y1, a ; y1 = a ^ (a >> (22-13)
    xor y2, g ; y2 = f^g
    psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e ; y2 = (f^g)&e
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    pxor XTMP2, XTMP3
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0 ; y2 = S1 + CH
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
    pxor X0, XTMP2 ; X0 = s1 {xDxC}
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
    mov y0, e ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a ; y1 = a
    xor y0, e ; y0 = e ^ (e >> (25-11))
    ror y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f ; y2 = f
    xor y1, a ; y1 = a ^ (a >> (22-13)
    ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g ; y2 = f^g
    xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    and y2, e ; y2 = (f^g)&e
    xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
    xor y2, g ; y2 = CH = ((f^g)&e)^g
    add y2, y0 ; y2 = S1 + CH
    ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
    mov y0, a ; y0 = a
    add h, y2 ; h = h + S1 + CH + k + w
    mov y2, a ; y2 = a
    or y0, c ; y0 = a|c
    add d, h ; d = d + h + S1 + CH + k + w
    and y2, c ; y2 = a&c
    and y0, b ; y0 = (a|c)&b
    add h, y1 ; h = h + S1 + CH + k + w + S0
    or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
    add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
    ROTATE_ARGS
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_sse4
align 32
sha256_sse4:
    push rbx
%ifndef LINUX
    push rsi
    push rdi
%endif
    push rbp
    push r13
    push r14
    push r15

    sub rsp,STACK_SIZE
%ifndef LINUX
    movdqa [rsp + _XMM_SAVE + 0*16],xmm6
    movdqa [rsp + _XMM_SAVE + 1*16],xmm7
    movdqa [rsp + _XMM_SAVE + 2*16],xmm8
    movdqa [rsp + _XMM_SAVE + 3*16],xmm9
    movdqa [rsp + _XMM_SAVE + 4*16],xmm10
    movdqa [rsp + _XMM_SAVE + 5*16],xmm11
    movdqa [rsp + _XMM_SAVE + 6*16],xmm12
%endif

    shl NUM_BLKS, 6 ; convert to bytes
    jz done_hash
    add NUM_BLKS, INP ; pointer to end of data
    mov [rsp + _INP_END], NUM_BLKS

    ;; load initial digest
    mov a,[4*0 + CTX]
    mov b,[4*1 + CTX]
    mov c,[4*2 + CTX]
    mov d,[4*3 + CTX]
    mov e,[4*4 + CTX]
    mov f,[4*5 + CTX]
    mov g,[4*6 + CTX]
    mov h,[4*7 + CTX]

    movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
    movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
    lea TBL,[K256 wrt rip]

    ;; byte swap first 16 dwords
    COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

    mov [rsp + _INP], INP

    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
    mov SRND, 3
align 16
loop1:
    movdqa XFER, [TBL + 0*16]
    paddd XFER, X0
    movdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa XFER, [TBL + 1*16]
    paddd XFER, X0
    movdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa XFER, [TBL + 2*16]
    paddd XFER, X0
    movdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa XFER, [TBL + 3*16]
    paddd XFER, X0
    movdqa [rsp + _XFER], XFER
    add TBL, 4*16
    FOUR_ROUNDS_AND_SCHED

    sub SRND, 1
    jne loop1

    mov SRND, 2
loop2:
    paddd X0, [TBL + 0*16]
    movdqa [rsp + _XFER], X0
    DO_ROUND 0
    DO_ROUND 1
    DO_ROUND 2
    DO_ROUND 3
    paddd X1, [TBL + 1*16]
    movdqa [rsp + _XFER], X1
    add TBL, 2*16
    DO_ROUND 0
    DO_ROUND 1
    DO_ROUND 2
    DO_ROUND 3

    movdqa X0, X2
    movdqa X1, X3

    sub SRND, 1
    jne loop2

    addm [4*0 + CTX],a
    addm [4*1 + CTX],b
    addm [4*2 + CTX],c
    addm [4*3 + CTX],d
    addm [4*4 + CTX],e
    addm [4*5 + CTX],f
    addm [4*6 + CTX],g
    addm [4*7 + CTX],h

    mov INP, [rsp + _INP]
    add INP, 64
    cmp INP, [rsp + _INP_END]
    jne loop0

done_hash:
%ifndef LINUX
    movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
    movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
    movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
    movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
    movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
    movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
    movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
%endif

    add rsp, STACK_SIZE

    pop r15
    pop r14
    pop r13
    pop rbp
%ifndef LINUX
    pop rdi
    pop rsi
%endif
    pop rbx

    ret


section .data
align 64
K256:
    dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
*/

#endif