52 #if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)) 57 #if defined(_MSC_VER) && defined(__SSE2__) 63 #if !defined(__SSE2__) 64 #if defined(_MSC_VER) && defined(_M_IX86) 65 inline double __cdecl
rx_sqrt(
double x) {
71 #define rx_sqrt rx_sqrt 74 #define RANDOMX_USE_X87 79 #define RANDOMX_USE_X87 88 #if !defined(RANDOMX_USE_X87) 89 #define rx_set_double_precision(x) 94 #include <x86intrin.h> 102 #define rx_aligned_alloc(a, b) _mm_malloc(a,b) 103 #define rx_aligned_free(a) _mm_free(a) 104 #define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) 105 #define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0) 107 #define rx_load_vec_f128 _mm_load_pd 108 #define rx_store_vec_f128 _mm_store_pd 109 #define rx_add_vec_f128 _mm_add_pd 110 #define rx_sub_vec_f128 _mm_sub_pd 111 #define rx_mul_vec_f128 _mm_mul_pd 112 #define rx_div_vec_f128 _mm_div_pd 113 #define rx_sqrt_vec_f128 _mm_sqrt_pd 116 return _mm_shuffle_pd(
a,
a, 1);
120 return _mm_castsi128_pd(_mm_set_epi64x(x1, x0));
124 return _mm_castsi128_pd(_mm_set1_epi64x(x));
127 #define rx_xor_vec_f128 _mm_xor_pd 128 #define rx_and_vec_f128 _mm_and_pd 129 #define rx_or_vec_f128 _mm_or_pd 133 #define rx_aesenc_vec_i128 _mm_aesenc_si128 134 #define rx_aesdec_vec_i128 _mm_aesdec_si128 141 return _mm_cvtsi128_si32(
a);
145 return _mm_cvtsi128_si32(_mm_shuffle_epi32(
a, 0x55));
149 return _mm_cvtsi128_si32(_mm_shuffle_epi32(
a, 0xaa));
153 return _mm_cvtsi128_si32(_mm_shuffle_epi32(
a, 0xff));
156 #define rx_set_int_vec_i128 _mm_set_epi32 157 #define rx_xor_vec_i128 _mm_xor_si128 158 #define rx_load_vec_i128 _mm_load_si128 159 #define rx_store_vec_i128 _mm_store_si128 162 __m128i ix = _mm_loadl_epi64((
const __m128i*)addr);
163 return _mm_cvtepi32_pd(ix);
166 constexpr
uint32_t rx_mxcsr_default = 0x9FC0;
169 _mm_setcsr(rx_mxcsr_default);
173 _mm_setcsr(rx_mxcsr_default | (
mode << 13));
177 return (_mm_getcsr() >> 13) & 3;
180 #elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors cant use doubles or 64 bit integers with SIMD 189 typedef __vector
uint8_t __m128i;
191 typedef __vector
int __m128li;
193 typedef __vector
double __m128d;
206 #define rx_aligned_alloc(a, b) malloc(a) 207 #define rx_aligned_free(a) free(a) 208 #define rx_prefetch_nta(x) 209 #define rx_prefetch_t0(x) 213 {
return (__m128i) vec_splats (scalar); }
216 #if defined(NATIVE_LITTLE_ENDIAN) 227 #if defined(NATIVE_LITTLE_ENDIAN) 232 store64(mem_addr + 0, _a.u64[0]);
233 store64(mem_addr + 1, _a.u64[1]);
238 return (
rx_vec_f128)vec_perm((__m128i)
a,(__m128i)
a,(__m128i){8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7});
289 #if defined(__CRYPTO__) 292 #if defined(NATIVE_LITTLE_ENDIAN) 293 return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0});
295 return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12});
300 __m128ll _v = vrev(v);
301 __m128ll _rkey = vrev(rkey);
302 __m128ll
result = vrev((__m128i)__builtin_crypto_vcipher(_v,_rkey));
307 __m128ll _v = vrev(v);
308 __m128ll
zero = (__m128ll){0};
309 __m128ll
out = vrev((__m128i)__builtin_crypto_vncipher(_v,
zero));
349 #if defined(NATIVE_LITTLE_ENDIAN) 363 #if defined(NATIVE_LITTLE_ENDIAN) 383 #define RANDOMX_DEFAULT_FENV 385 #elif defined(__aarch64__) 388 #include <arm_neon.h> 389 #include <arm_acle.h> 396 if (posix_memalign(&
p, align, size) == 0)
402 #define rx_aligned_free(a) free(a) 405 asm volatile (
"prfm pldl1strm, [%0]\n" : :
"r" (ptr));
409 asm volatile (
"prfm pldl1strm, [%0]\n" : :
"r" (ptr));
413 return vld1q_f64((
const float64_t*)pd);
417 vst1q_f64((float64_t*)mem_addr, val);
422 temp = vcopyq_laneq_f64(temp, 1,
a, 1);
423 a = vcopyq_laneq_f64(
a, 1,
a, 0);
424 return vcopyq_laneq_f64(
a, 0, temp, 1);
428 uint64x2_t temp0 = vdupq_n_u64(x0);
429 uint64x2_t temp1 = vdupq_n_u64(x1);
430 return vreinterpretq_f64_u64(vcopyq_laneq_u64(temp0, 1, temp1, 0));
434 return vreinterpretq_f64_u64(vdupq_n_u64(x));
437 #define rx_add_vec_f128 vaddq_f64 438 #define rx_sub_vec_f128 vsubq_f64 439 #define rx_mul_vec_f128 vmulq_f64 440 #define rx_div_vec_f128 vdivq_f64 441 #define rx_sqrt_vec_f128 vsqrtq_f64 444 return vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(
a), vreinterpretq_u8_f64(
b)));
448 return vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(
a), vreinterpretq_u8_f64(
b)));
452 return vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(
a), vreinterpretq_u8_f64(
b)));
455 #ifdef __ARM_FEATURE_CRYPTO 459 const uint8x16_t
zero = { 0 };
460 return vaesmcq_u8(vaeseq_u8(
a,
zero)) ^
key;
464 const uint8x16_t
zero = { 0 };
465 return vaesimcq_u8(vaesdq_u8(
a,
zero)) ^
key;
472 #define rx_xor_vec_i128 veorq_u8 475 return vgetq_lane_s32(vreinterpretq_s32_u8(
a), 0);
479 return vgetq_lane_s32(vreinterpretq_s32_u8(
a), 1);
483 return vgetq_lane_s32(vreinterpretq_s32_u8(
a), 2);
487 return vgetq_lane_s32(vreinterpretq_s32_u8(
a), 3);
496 return vreinterpretq_u8_s32(vld1q_s32(
data));
499 #define rx_xor_vec_i128 veorq_u8 502 return vld1q_u8((
const uint8_t*)mem_addr);
506 vst1q_u8((
uint8_t*)mem_addr, val);
513 x = vsetq_lane_f64(lo, x, 0);
514 x = vsetq_lane_f64(hi, x, 1);
518 #define RANDOMX_DEFAULT_FENV 520 #else //portable fallback 542 #define rx_aligned_alloc(a, b) malloc(a) 543 #define rx_aligned_free(a) free(a) 544 #define rx_prefetch_nta(x) 545 #define rx_prefetch_t0(x) 630 x.
i.
u64[0] =
a.i.u64[0] ^
b.i.u64[0];
631 x.
i.
u64[1] =
a.i.u64[1] ^
b.i.u64[1];
637 x.
i.
u64[0] =
a.i.u64[0] &
b.i.u64[0];
638 x.
i.
u64[1] =
a.i.u64[1] &
b.i.u64[1];
644 x.
i.
u64[0] =
a.i.u64[0] |
b.i.u64[0];
645 x.
i.
u64[1] =
a.i.u64[1] |
b.i.u64[1];
676 c.u32[0] =
a.u32[0] ^
b.u32[0];
677 c.u32[1] =
a.u32[1] ^
b.u32[1];
678 c.u32[2] =
a.u32[2] ^
b.u32[2];
679 c.u32[3] =
a.u32[3] ^
b.u32[3];
684 #if defined(NATIVE_LITTLE_ENDIAN) 698 #if defined(NATIVE_LITTLE_ENDIAN) 716 #define RANDOMX_DEFAULT_FENV 737 #ifdef RANDOMX_DEFAULT_FENV static const char * platformError
Definition: intrin_portable.h:721
#define INT32_MAX
Definition: stdint.h:183
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a)
Definition: intrin_portable.h:661
#define FORCE_INLINE
Definition: endian.h:10
Definition: intrin_portable.h:527
constexpr int RoundToZero
Definition: intrin_portable.h:49
int64_t smulh(int64_t, int64_t)
Definition: instructions_portable.cpp:125
FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 a, rx_vec_i128 b)
Definition: intrin_portable.h:674
int i
Definition: pymoduletest.py:23
#define INT64_MAX
Definition: stdint.h:185
uint64_t u64[2]
Definition: intrin_portable.h:528
constexpr int RoundToNearest
Definition: intrin_portable.h:46
t
Definition: console.py:33
Definition: intrin_portable.h:534
std::string data
Definition: base58.cpp:37
const char * key
Definition: hmac_keccak.cpp:40
uint64_t rotr(uint64_t, unsigned int)
Definition: instructions_portable.cpp:92
unsigned short uint16_t
Definition: stdint.h:125
uint64_t rotl(uint64_t, unsigned int)
Definition: instructions_portable.cpp:99
uint64_t mulh(uint64_t, uint64_t)
Definition: instructions_portable.cpp:108
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0)
Definition: intrin_portable.h:614
FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b)
Definition: intrin_portable.h:635
#define rx_set_double_precision(x)
Definition: intrin_portable.h:89
FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int i3, int i2, int i1, int i0)
Definition: intrin_portable.h:665
tools::wallet2::message_signature_result_t result
Definition: signature.cpp:62
unsigned char uint8_t
Definition: stdint.h:124
uint32_t rx_get_rounding_mode()
Definition: instructions_portable.cpp:160
FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b)
Definition: intrin_portable.h:642
static FORCE_INLINE uint64_t load64(const void *src)
Definition: endian.h:50
constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x)
Definition: intrin_portable.h:38
FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b)
Definition: intrin_portable.h:566
#define rx_aligned_alloc(a, b)
Definition: intrin_portable.h:542
#define rx_sqrt
Definition: intrin_portable.h:85
static const unsigned char zero[32]
Definition: fe_isnonzero.c:12
void rx_reset_float_state()
Definition: instructions_portable.cpp:136
constexpr uint32_t B
Definition: jit_compiler_a64.cpp:38
FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *p)
Definition: intrin_portable.h:683
unsigned int uint32_t
Definition: stdint.h:126
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a)
Definition: intrin_portable.h:649
constexpr int RoundUp
Definition: intrin_portable.h:48
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x)
Definition: intrin_portable.h:621
unsigned __int64 uint64_t
Definition: stdint.h:136
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b)
Definition: intrin_portable.h:628
void rx_set_rounding_mode(uint32_t mode)
Definition: instructions_portable.cpp:141
FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a)
Definition: intrin_portable.h:559
#define UINT64_MAX
Definition: stdint.h:189
enum modes mode
Definition: minihttptestserver.c:268
FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b)
Definition: intrin_portable.h:580
FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey)
Definition: intrin_portable.h:729
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1124
double hi
Definition: intrin_portable.h:537
#define UINT32_MAX
Definition: stdint.h:188
FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b)
Definition: intrin_portable.h:587
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double *pd)
Definition: intrin_portable.h:547
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a)
Definition: intrin_portable.h:653
FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey)
Definition: intrin_portable.h:725
static FORCE_INLINE void store32(void *dst, uint32_t w)
Definition: endian.h:67
constexpr uint64_t signExtend2sCompl(uint32_t x)
Definition: intrin_portable.h:42
signed __int64 int64_t
Definition: stdint.h:135
FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a)
Definition: intrin_portable.h:601
static FORCE_INLINE void store64(void *dst, uint64_t w)
Definition: endian.h:86
d
Definition: pymoduletest.py:79
p
Definition: pymoduletest.py:75
FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b)
Definition: intrin_portable.h:697
rx_vec_i128 i
Definition: intrin_portable.h:539
#define rx_prefetch_t0(x)
Definition: intrin_portable.h:545
double loadDoublePortable(const void *addr)
Definition: instructions_portable.cpp:204
signed int int32_t
Definition: stdint.h:123
FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a)
Definition: intrin_portable.h:594
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a)
Definition: intrin_portable.h:657
double lo
Definition: intrin_portable.h:536
FORCE_INLINE void rx_store_vec_f128(double *mem_addr, rx_vec_f128 a)
Definition: intrin_portable.h:554
static FORCE_INLINE uint32_t load32(const void *src)
Definition: endian.h:29
#define rx_prefetch_nta(x)
Definition: intrin_portable.h:544
FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b)
Definition: intrin_portable.h:573
c
Definition: pymoduletest.py:79
FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a)
Definition: intrin_portable.h:608
cryptonote::block b
Definition: block.cpp:40
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void *addr)
Definition: intrin_portable.h:709
uint32_t u32[4]
Definition: intrin_portable.h:529
constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x)
Definition: intrin_portable.h:34
constexpr int RoundDown
Definition: intrin_portable.h:47