/*
 * AVX2 building blocks for a BlaMka-style round: the BLAKE2b quarter-round
 * in which each addition a + b is replaced by a + b + 2 * lo32(a) * lo32(b).
 * Every macro works on __m256i values holding four 64-bit words and updates
 * its arguments in place, so every argument must be a modifiable lvalue.
 *
 * Each multi-statement macro is wrapped in do { ... } while (0) so that its
 * temporaries (ml, tmp1, tmp2) get a scope of their own; without that, using
 * G1_AVX2 and G2_AVX2 back to back would declare `ml` twice.  The expansion
 * deliberately ends with its own ';' because the composite macros below (and
 * possibly code outside this header) invoke the helpers without one.  As a
 * consequence, do not use these macros as the unbraced body of an if/else.
 *
 * NOTE(review): the rotate steps in G1_AVX2/G2_AVX2 and the C0/C1 exchange in
 * DIAGONALIZE_2/UNDIAGONALIZE_2 were reinstated during clean-up of a damaged
 * copy of this header.  The rotation amounts follow the G function in
 * RFC 9106, section 3.6; confirm against the project's reference copy and its
 * known-answer tests before shipping.
 */
#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H

#include <x86intrin.h>

/*
 * Rotate each 64-bit word of x right by 32, 24, 16 or 63 bits.
 * 32/24/16 are whole-byte rotations and are done with shuffles; 63 is a
 * rotate-left by one: (x >> 63) ^ (x + x).  rotr63 evaluates x three times,
 * so pass a plain variable, not an expression with side effects.
 */
#define rotr32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) \
    _mm256_shuffle_epi8((x), _mm256_setr_epi8( \
         3,  4,  5,  6,  7,  0,  1,  2, 11, 12, 13, 14, 15,  8,  9, 10, \
         3,  4,  5,  6,  7,  0,  1,  2, 11, 12, 13, 14, 15,  8,  9, 10))
#define rotr16(x) \
    _mm256_shuffle_epi8((x), _mm256_setr_epi8( \
         2,  3,  4,  5,  6,  7,  0,  1, 10, 11, 12, 13, 14, 15,  8,  9, \
         2,  3,  4,  5,  6,  7,  0,  1, 10, 11, 12, 13, 14, 15,  8,  9))
#define rotr63(x) \
    _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))

/*
 * First half of the G function, applied lane-wise to two independent
 * register sets (A0,B0,C0,D0) and (A1,B1,C1,D1):
 *     a += b + 2 * lo32(a) * lo32(b);  d = rotr64(d ^ a, 32);
 *     c += d + 2 * lo32(c) * lo32(d);  b = rotr64(b ^ c, 24);
 * _mm256_mul_epu32 multiplies the low 32 bits of each 64-bit word.
 */
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr32(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr24(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr32(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr24(B1); \
    } while (0);

/*
 * Second half of the G function; same shape as G1_AVX2 but with
 * rotations by 16 (on d) and 63 (on b).
 */
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr16(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr16(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while (0);

/*
 * Diagonalise two independent 4x4 word matrices whose rows each occupy one
 * whole register (rows A0..D0 and rows A1..D1).  Row B is rotated left by
 * one word, C by two and D by three, so that a following lane-wise G mixes
 * the diagonals.  A0 and A1 are accepted for symmetry and left untouched.
 */
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while (0);

/*
 * Diagonalise for the split-row layout: within each 128-bit half, a matrix
 * row is two words of X0 followed by two words of X1 (X = A, B, C, D), and
 * the two halves are independent matrices.  B is rotated left by one word
 * and D by three using blend + in-half word swap; rotating C by two words
 * is exactly an exchange of C0 and C1.  A0 and A1 are left untouched.
 */
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while (0);

/* Inverse of DIAGONALIZE_1: rotate B, C, D back by one, two, three words. */
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while (0);

/* Inverse of DIAGONALIZE_2 (note the swapped blend masks on the D row). */
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while (0);

/*
 * One full round for the whole-row layout: column step (G1 + G2),
 * diagonalise, diagonal step (G1 + G2), undo the diagonalisation.
 * The helper macros carry their own terminating ';'.
 */
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while (0);

/* One full round for the split-row layout (see DIAGONALIZE_2). */
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while (0);

#endif /* BLAKE_ROUND_MKA_OPT_H */