blamka-round-avx2.h
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Original code from Argon2 reference source code package used under CC0 Licence
 * https://github.com/P-H-C/phc-winner-argon2
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/

#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H

#include "blake2-impl.h"

#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif

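/* 64-bit right rotations on each lane of a 256-bit vector. rotr32 swaps the
   32-bit halves of every 64-bit lane, rotr24 and rotr16 are byte-granular
   rotations done with a byte shuffle, and rotr63 is (x >> 63) ^ (x + x),
   using the cheap x + x in place of a second shift for x << 1. */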
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))

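/* First half of the BlaMka G function, applied to two register groups at
   once: a = a + b + 2 * lo32(a) * lo32(b), where _mm256_mul_epu32 multiplies
   the low 32 bits of each 64-bit lane and the result is doubled by adding it
   to itself. The rotations are Blake2b's 32-bit and 24-bit steps. */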
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr32(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr24(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr32(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr24(B1); \
    } while((void)0, 0);

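/* Second half of the BlaMka G function: the same multiply-add mixing as
   G1_AVX2, followed by Blake2b's 16-bit and 63-bit rotations. */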
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr16(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr16(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while((void)0, 0);

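/* Rotates rows B, C and D by one, two and three 64-bit lanes so that the
   next G application mixes the diagonals of the 4x4 state instead of its
   columns. Each __m256i holds one full row; two states are handled at once. */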
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while((void)0, 0);

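/* Diagonalization for the second register layout (BLAKE2_ROUND_2), where the
   words of each logical row are split across a register pair. The lane
   rotation is therefore built from 32-bit blends and 64-bit lane permutes
   rather than a single permute per register. */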
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while((void)0, 0);

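/* Inverse of DIAGONALIZE_1: rotates rows B, C and D back into column order
   after the diagonal step. */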
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while((void)0, 0);

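/* Inverse of DIAGONALIZE_2. Note the D0/D1 blend masks are swapped relative
   to DIAGONALIZE_2 (0x33 before 0xCC), which is what undoes that rotation. */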
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while((void)0, 0);

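/* One full round of the Blake2b permutation with the BlaMka G function:
   a column step (G1 + G2), diagonalization, a diagonal step, and the
   inverse permutation, for the row-per-register layout. */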
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while((void)0, 0);

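/* The same round for the second register layout; see DIAGONALIZE_2. */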
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while((void)0, 0);

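/*
 * Illustrative usage sketch (not part of the original header): the round
 * macros expect eight __m256i lvalues, passed in the order A0, A1, B0, B1,
 * C0, C1, D0, D1. The variable names and the load/store code below are
 * assumptions for the example; the real callers are the Argon2/RandomX
 * block-filling routines.
 *
 *   __m256i a0 = _mm256_loadu_si256((const __m256i*)(block +   0));
 *   __m256i b0 = _mm256_loadu_si256((const __m256i*)(block +  32));
 *   __m256i c0 = _mm256_loadu_si256((const __m256i*)(block +  64));
 *   __m256i d0 = _mm256_loadu_si256((const __m256i*)(block +  96));
 *   ... a1, b1, c1, d1 loaded from the second 4x4 state ...
 *   BLAKE2_ROUND_1(a0, a1, b0, b1, c0, c1, d0, d1);
 *   _mm256_storeu_si256((__m256i*)(block + 0), a0);
 *   ... store the remaining registers ...
 */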
#endif /* BLAKE_ROUND_MKA_OPT_H */