#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H

#include <immintrin.h> /* AVX2 intrinsics */
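/*
 * Right rotations of each 64-bit lane, expressed as shuffles instead of
 * shift+or pairs. The rotation amounts (32, 24, 16, 63) are the standard
 * BLAKE2b schedule; rotr63 computes (x >> 63) ^ (x + x), i.e. a left
 * rotation by 1, which equals a right rotation by 63.
 */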
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
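/*
 * First half-round of the BlaMka permutation on eight 256-bit registers
 * (four 64-bit lanes each). Where BLAKE2b uses a = a + b, BlaMka uses
 * a = a + b + 2 * trunc(a) * trunc(b), with trunc() keeping the low 32
 * bits; _mm256_mul_epu32 computes that 32x32->64 product in each lane,
 * and the following add doubles it.
 */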
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr32(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr24(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr32(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr24(B1); \
    } while ((void)0, 0);
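/*
 * Second half-round: the same BlaMka mixing, but with the 16- and
 * 63-bit rotations that complete one full G application per column.
 */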
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr16(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr16(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while ((void)0, 0);
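/*
 * DIAGONALIZE_1 rotates the B rows by one 64-bit lane, C by two and D
 * by three, turning column mixing into diagonal mixing exactly as in
 * BLAKE2b. Here each register holds four consecutive 64-bit lanes of a
 * row, so a single in-register permute suffices.
 */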
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while ((void)0, 0);
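/*
 * DIAGONALIZE_2 performs the same lane rotation when each logical row
 * is split across a register pair (one half in the *0 register, the
 * other in *1): blends plus cross-register permutes replace the simple
 * in-register rotates, and the two C halves are simply swapped.
 */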
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while ((void)0, 0);
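/*
 * The UNDIAGONALIZE macros invert the corresponding DIAGONALIZE step,
 * restoring column order after the diagonal half of the round.
 */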
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while ((void)0, 0);
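/*
 * Inverse of DIAGONALIZE_2: the B blend results go back to B0/B1 in the
 * opposite order, and the D pair uses the swapped 0x33/0xCC blend masks,
 * undoing the three-lane rotation.
 */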
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while ((void)0, 0);
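/*
 * A full BlaMka round: two G half-rounds on the columns, diagonalize,
 * two more half-rounds on the diagonals, then restore column order.
 * ROUND_1 and ROUND_2 differ only in how the 64-bit lanes of the state
 * are distributed across the eight registers, hence the two
 * DIAGONALIZE/UNDIAGONALIZE variants.
 */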
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while ((void)0, 0);
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while ((void)0, 0);
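/*
 * Usage sketch (illustrative, not part of this header): the Argon2
 * reference implementation's AVX2 fill_block treats a 1024-byte block
 * as 32 __m256i values and applies both round variants across it. The
 * function and parameter names below are assumptions for the example.
 *
 *     #include <immintrin.h>
 *
 *     static void apply_rounds(__m256i state[32]) {
 *         unsigned int i;
 *         // ROUND_1 over register groups holding consecutive lanes
 *         for (i = 0; i < 4; ++i) {
 *             BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4],
 *                            state[8 * i + 1], state[8 * i + 5],
 *                            state[8 * i + 2], state[8 * i + 6],
 *                            state[8 * i + 3], state[8 * i + 7]);
 *         }
 *         // ROUND_2 over strided register groups
 *         for (i = 0; i < 4; ++i) {
 *             BLAKE2_ROUND_2(state[0 + i],  state[4 + i],
 *                            state[8 + i],  state[12 + i],
 *                            state[16 + i], state[20 + i],
 *                            state[24 + i], state[28 + i]);
 *         }
 *     }
 */

#endif /* BLAKE_ROUND_MKA_OPT_H */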