11 #if defined(__x86_64__) || defined(__amd64__) 15 void Transform(uint32_t*
s,
const unsigned char* chunk,
size_t blocks)
16 #if defined(__clang__) 23 #if __has_feature(address_sanitizer) 24 __attribute__((no_sanitize(
"address")))
28 static const uint32_t K256
alignas(16) [] = {
29 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
30 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
31 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
32 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
33 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
34 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
35 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
36 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
37 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
38 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
39 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
40 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
41 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
42 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
43 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
44 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
46 static const uint32_t FLIP_MASK
alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
47 static const uint32_t SHUF_00BA
alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
48 static const uint32_t SHUF_DC00
alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
49 uint32_t a, b, c, d, f,
g, h, y0, y1, y2;
51 uint64_t inp_end, inp;
52 uint32_t xfer
alignas(16) [4];
74 "pshufb %%xmm12,%%xmm4;" 75 "movdqu 0x10(%1),%%xmm5;" 76 "pshufb %%xmm12,%%xmm5;" 77 "movdqu 0x20(%1),%%xmm6;" 78 "pshufb %%xmm12,%%xmm6;" 79 "movdqu 0x30(%1),%%xmm7;" 80 "pshufb %%xmm12,%%xmm7;" 85 "movdqa 0x0(%13),%%xmm9;" 86 "paddd %%xmm4,%%xmm9;" 88 "movdqa %%xmm7,%%xmm0;" 92 "palignr $0x4,%%xmm6,%%xmm0;" 97 "movdqa %%xmm5,%%xmm1;" 100 "paddd %%xmm4,%%xmm0;" 104 "palignr $0x4,%%xmm4,%%xmm1;" 108 "movdqa %%xmm1,%%xmm2;" 112 "movdqa %%xmm1,%%xmm3;" 116 "pslld $0x19,%%xmm1;" 126 "movdqa %%xmm3,%%xmm2;" 129 "movdqa %%xmm3,%%xmm8;" 138 "psrld $0x12,%%xmm2;" 143 "pxor %%xmm3,%%xmm1;" 150 "pxor %%xmm2,%%xmm1;" 154 "pxor %%xmm8,%%xmm1;" 158 "pshufd $0xfa,%%xmm7,%%xmm2;" 161 "paddd %%xmm1,%%xmm0;" 164 "movdqa %%xmm2,%%xmm3;" 168 "movdqa %%xmm2,%%xmm8;" 174 "psrlq $0x11,%%xmm2;" 176 "psrlq $0x13,%%xmm3;" 184 "pxor %%xmm3,%%xmm2;" 188 "pxor %%xmm2,%%xmm8;" 192 "pshufb %%xmm10,%%xmm8;" 196 "paddd %%xmm8,%%xmm0;" 199 "pshufd $0x50,%%xmm0,%%xmm2;" 202 "movdqa %%xmm2,%%xmm3;" 206 "movdqa %%xmm2,%%xmm4;" 211 "psrlq $0x11,%%xmm2;" 214 "psrlq $0x13,%%xmm3;" 222 "pxor %%xmm3,%%xmm2;" 226 "pxor %%xmm2,%%xmm4;" 230 "pshufb %%xmm11,%%xmm4;" 234 "paddd %%xmm0,%%xmm4;" 239 "movdqa 0x10(%13),%%xmm9;" 240 "paddd %%xmm5,%%xmm9;" 242 "movdqa %%xmm4,%%xmm0;" 246 "palignr $0x4,%%xmm7,%%xmm0;" 251 "movdqa %%xmm6,%%xmm1;" 254 "paddd %%xmm5,%%xmm0;" 258 "palignr $0x4,%%xmm5,%%xmm1;" 262 "movdqa %%xmm1,%%xmm2;" 266 "movdqa %%xmm1,%%xmm3;" 270 "pslld $0x19,%%xmm1;" 280 "movdqa %%xmm3,%%xmm2;" 283 "movdqa %%xmm3,%%xmm8;" 292 "psrld $0x12,%%xmm2;" 297 "pxor %%xmm3,%%xmm1;" 304 "pxor %%xmm2,%%xmm1;" 308 "pxor %%xmm8,%%xmm1;" 312 "pshufd $0xfa,%%xmm4,%%xmm2;" 315 "paddd %%xmm1,%%xmm0;" 318 "movdqa %%xmm2,%%xmm3;" 322 "movdqa %%xmm2,%%xmm8;" 328 "psrlq $0x11,%%xmm2;" 330 "psrlq $0x13,%%xmm3;" 338 "pxor %%xmm3,%%xmm2;" 342 "pxor %%xmm2,%%xmm8;" 346 "pshufb %%xmm10,%%xmm8;" 350 "paddd %%xmm8,%%xmm0;" 353 "pshufd $0x50,%%xmm0,%%xmm2;" 356 "movdqa %%xmm2,%%xmm3;" 360 "movdqa %%xmm2,%%xmm5;" 365 "psrlq $0x11,%%xmm2;" 368 "psrlq $0x13,%%xmm3;" 376 "pxor %%xmm3,%%xmm2;" 380 "pxor %%xmm2,%%xmm5;" 384 "pshufb %%xmm11,%%xmm5;" 388 "paddd %%xmm0,%%xmm5;" 393 "movdqa 0x20(%13),%%xmm9;" 394 "paddd %%xmm6,%%xmm9;" 396 "movdqa %%xmm5,%%xmm0;" 400 "palignr $0x4,%%xmm4,%%xmm0;" 405 "movdqa %%xmm7,%%xmm1;" 408 "paddd %%xmm6,%%xmm0;" 412 "palignr $0x4,%%xmm6,%%xmm1;" 416 "movdqa %%xmm1,%%xmm2;" 420 "movdqa %%xmm1,%%xmm3;" 424 "pslld $0x19,%%xmm1;" 434 "movdqa %%xmm3,%%xmm2;" 437 "movdqa %%xmm3,%%xmm8;" 446 "psrld $0x12,%%xmm2;" 451 "pxor %%xmm3,%%xmm1;" 458 "pxor %%xmm2,%%xmm1;" 462 "pxor %%xmm8,%%xmm1;" 466 "pshufd $0xfa,%%xmm5,%%xmm2;" 469 "paddd %%xmm1,%%xmm0;" 472 "movdqa %%xmm2,%%xmm3;" 476 "movdqa %%xmm2,%%xmm8;" 482 "psrlq $0x11,%%xmm2;" 484 "psrlq $0x13,%%xmm3;" 492 "pxor %%xmm3,%%xmm2;" 496 "pxor %%xmm2,%%xmm8;" 500 "pshufb %%xmm10,%%xmm8;" 504 "paddd %%xmm8,%%xmm0;" 507 "pshufd $0x50,%%xmm0,%%xmm2;" 510 "movdqa %%xmm2,%%xmm3;" 514 "movdqa %%xmm2,%%xmm6;" 519 "psrlq $0x11,%%xmm2;" 522 "psrlq $0x13,%%xmm3;" 530 "pxor %%xmm3,%%xmm2;" 534 "pxor %%xmm2,%%xmm6;" 538 "pshufb %%xmm11,%%xmm6;" 542 "paddd %%xmm0,%%xmm6;" 547 "movdqa 0x30(%13),%%xmm9;" 548 "paddd %%xmm7,%%xmm9;" 551 "movdqa %%xmm6,%%xmm0;" 555 "palignr $0x4,%%xmm5,%%xmm0;" 560 "movdqa %%xmm4,%%xmm1;" 563 "paddd %%xmm7,%%xmm0;" 567 "palignr $0x4,%%xmm7,%%xmm1;" 571 "movdqa %%xmm1,%%xmm2;" 575 "movdqa %%xmm1,%%xmm3;" 579 "pslld $0x19,%%xmm1;" 589 "movdqa %%xmm3,%%xmm2;" 592 "movdqa %%xmm3,%%xmm8;" 601 "psrld $0x12,%%xmm2;" 606 "pxor %%xmm3,%%xmm1;" 613 "pxor %%xmm2,%%xmm1;" 617 "pxor %%xmm8,%%xmm1;" 621 "pshufd $0xfa,%%xmm6,%%xmm2;" 624 "paddd %%xmm1,%%xmm0;" 627 "movdqa %%xmm2,%%xmm3;" 631 "movdqa %%xmm2,%%xmm8;" 637 "psrlq $0x11,%%xmm2;" 639 "psrlq $0x13,%%xmm3;" 647 "pxor %%xmm3,%%xmm2;" 651 "pxor %%xmm2,%%xmm8;" 655 "pshufb %%xmm10,%%xmm8;" 659 "paddd %%xmm8,%%xmm0;" 662 "pshufd $0x50,%%xmm0,%%xmm2;" 665 "movdqa %%xmm2,%%xmm3;" 669 "movdqa %%xmm2,%%xmm7;" 674 "psrlq $0x11,%%xmm2;" 677 "psrlq $0x13,%%xmm3;" 685 "pxor %%xmm3,%%xmm2;" 689 "pxor %%xmm2,%%xmm7;" 693 "pshufb %%xmm11,%%xmm7;" 697 "paddd %%xmm0,%%xmm7;" 707 "paddd 0x0(%13),%%xmm4;" 821 "paddd 0x10(%13),%%xmm5;" 936 "movdqa %%xmm6,%%xmm4;" 937 "movdqa %%xmm7,%%xmm5;" 963 :
"+r"(
s),
"+r"(chunk),
"+r"(blocks),
"=r"(a),
"=r"(b),
"=r"(c),
"=r"(d),
"=r"(f),
"=r"(
g),
"=r"(h),
"=r"(y0),
"=r"(y1),
"=r"(y2),
"=r"(tbl),
"+m"(inp_end),
"+m"(inp),
"+m"(xfer)
964 :
"m"(K256),
"m"(FLIP_MASK),
"m"(SHUF_00BA),
"m"(SHUF_DC00)
965 :
"cc",
"memory",
"xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"xmm8",
"xmm9",
"xmm10",
"xmm11",
"xmm12" void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks)