Monero
blamka-round-avx2.h
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Original code from Argon2 reference source code package used under CC0 Licence
 * https://github.com/P-H-C/phc-winner-argon2
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*/

#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H

#include "blake2-impl.h"

/* GCC and Clang expose the AVX2 intrinsics through x86intrin.h; MSVC uses intrin.h. */
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif

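/* 64-bit right rotations on four packed lanes. rotr32 is a 32-bit lane swap
 * within each 64-bit element, rotr24 and rotr16 are byte shuffles, and rotr63
 * uses the identity rotr(x, 63) == (x >> 63) ^ (x << 1), with x + x standing
 * in for x << 1. */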
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))

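/* First half of the BlaMka G function, applied to two independent 4-lane
 * groups: a = fBlaMka(a, b) = a + b + 2 * lo32(a) * lo32(b), then
 * d = rotr32(d ^ a); c = fBlaMka(c, d), then b = rotr24(b ^ c).
 * _mm256_mul_epu32 multiplies the low 32 bits of each 64-bit lane, and the
 * following add doubles the product. */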
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr32(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr24(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr32(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr24(B1); \
    } while((void)0, 0);

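/* Second half of the BlaMka G function: same structure as G1_AVX2, but with
 * the 16-bit and 63-bit rotations. */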
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr16(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr16(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while((void)0, 0);

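/* Rotate the B, C and D registers by one, two and three 64-bit lanes so the
 * next G pass mixes the diagonals of each 4x4 state instead of its columns.
 * This variant matches the register layout used by BLAKE2_ROUND_1. */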
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while((void)0, 0);

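/* Diagonalization for the second register layout, targeted by BLAKE2_ROUND_2:
 * the diagonal lanes straddle two registers, so the lane rotation is done
 * with cross-register blends followed by in-register permutes, while C0 and
 * C1 are simply swapped. */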
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
    } while((void)0, 0);

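/* Inverse of DIAGONALIZE_1: rotate the lanes back to column order. */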
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while((void)0, 0);

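/* Inverse of DIAGONALIZE_2. */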
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
    } while((void)0, 0);

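/* One full BlaMka round over two 4x4 matrices of 64-bit words: a column pass
 * (G1 + G2), diagonalization, a diagonal pass, and undiagonalization. */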
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while((void)0, 0);

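/* Same round for the second register layout, using the blend-based
 * (un)diagonalization. */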
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while((void)0, 0);

#endif /* BLAKE_ROUND_MKA_OPT_H */
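
For reference, a minimal usage sketch (not part of the header): it loads two 16-word sub-blocks into eight registers, applies one round with BLAKE2_ROUND_1, and stores the result. The helper name and the flat uint64_t layout are illustrative assumptions, not this codebase's API; in Argon2 and RandomX the macros are driven over a full 1024-byte block by the block-filling code. Assumes the surrounding blake2 sources are on the include path and AVX2 is enabled (e.g. -mavx2).

#include <stdint.h>
#include "blamka-round-avx2.h"

/* Hypothetical helper, for illustration only: one BlaMka round over
 * 32 contiguous 64-bit words (two 16-word sub-blocks). */
static void blamka_round_example(uint64_t w[32])
{
    __m256i A0 = _mm256_loadu_si256((const __m256i*)&w[0]);
    __m256i B0 = _mm256_loadu_si256((const __m256i*)&w[4]);
    __m256i C0 = _mm256_loadu_si256((const __m256i*)&w[8]);
    __m256i D0 = _mm256_loadu_si256((const __m256i*)&w[12]);
    __m256i A1 = _mm256_loadu_si256((const __m256i*)&w[16]);
    __m256i B1 = _mm256_loadu_si256((const __m256i*)&w[20]);
    __m256i C1 = _mm256_loadu_si256((const __m256i*)&w[24]);
    __m256i D1 = _mm256_loadu_si256((const __m256i*)&w[28]);

    /* The macro supplies its own trailing semicolon. */
    BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1)

    _mm256_storeu_si256((__m256i*)&w[0],  A0);
    _mm256_storeu_si256((__m256i*)&w[4],  B0);
    _mm256_storeu_si256((__m256i*)&w[8],  C0);
    _mm256_storeu_si256((__m256i*)&w[12], D0);
    _mm256_storeu_si256((__m256i*)&w[16], A1);
    _mm256_storeu_si256((__m256i*)&w[20], B1);
    _mm256_storeu_si256((__m256i*)&w[24], C1);
    _mm256_storeu_si256((__m256i*)&w[28], D1);
}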