1 #ifndef OSMSCOUT_SYSTEM_SSEMATH_H 2 #define OSMSCOUT_SYSTEM_SSEMATH_H 23 #include <osmscout/CoreFeatures.h> 31 #include <osmscout/private/Config.h> 35 #define ARRAY2V2DF(name) * reinterpret_cast<const v2df*>(name) 36 #define ARRAY2V2DI(name) * reinterpret_cast<const v2di*>(name) 38 #define DECLARE_COEFFS(name) extern const ALIGN16_BEG double name[] ALIGN16_END; 40 #define POLY_EVAL3SINGLE_HORNER(y, x, coeff) \ 41 y = coeff[0]; y = y*x + coeff[2]; y = y*x + coeff[4]; y = y*x + coeff[6] 43 #define POLY_EVAL3_HORNER(y, x, coeff) \ 44 y = ARRAY2V2DF(&coeff[0]); \ 45 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[2])); \ 46 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[4])); \ 47 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[6])) 49 #define POLY_EVAL6SINGLE_HORNER(y, x, coeff) \ 50 y = coeff[0]; y = y*x + coeff[2]; y = y*x + coeff[4]; y = y*x + coeff[6]; \ 51 y = y*x + coeff[8]; y = y*x + coeff[10]; y = y*x + coeff[12] 53 #define POLY_EVAL6_HORNER(y, x, coeff) \ 54 y = ARRAY2V2DF(&coeff[0]); \ 55 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[2])); \ 56 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[4])); \ 57 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[6])); \ 58 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[8])); \ 59 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[10])); \ 60 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[12])) 62 #define POLY_EVAL6SINGLE_ESTRIN(y, x, coeff) \ 63 y = coeff[12]; y = y*x + coeff[10]; y = y*x + coeff[8]; y = y*x + coeff[6]; \ 64 y = y*x + coeff[4]; y = y*x + coeff[2]; y = y*x + coeff[0]; \ 66 #define POLY_EVAL6_ESTRIN(y, x, coeff) \ 67 v2df pt0 = _mm_add_pd(ARRAY2V2DF(&coeff[0]), _mm_mul_pd(ARRAY2V2DF(&coeff[ 2]), x)); \ 68 v2df pt1 = _mm_add_pd(ARRAY2V2DF(&coeff[4]), _mm_mul_pd(ARRAY2V2DF(&coeff[ 6]), x)); \ 69 v2df pt2 = _mm_add_pd(ARRAY2V2DF(&coeff[8]), _mm_mul_pd(ARRAY2V2DF(&coeff[10]), x)); \ 70 v2df pt3 = ARRAY2V2DF(&coeff[12]); \ 71 v2df ptx2 = _mm_mul_pd(x,x); \ 72 pt0 = _mm_add_pd(pt0, _mm_mul_pd(pt1, ptx2)); \ 73 pt2 = _mm_add_pd(pt2, _mm_mul_pd(pt3, ptx2)); \ 74 v2df ptx4 = _mm_mul_pd(ptx2,ptx2); \ 75 y = _mm_add_pd(pt0, _mm_mul_pd(pt2, ptx4)) 77 #ifdef _SSE_SLOW_ //For some cpu's this is faster 82 #define POLY_EVAL3SINGLE(y, x, coeff) POLY_EVAL3SINGLE_HORNER(y, x, coeff) 83 #define POLY_EVAL3(y, x, coeff) POLY_EVAL3_HORNER(y, x, coeff) 84 #define POLY_EVAL6SINGLE(y, x, coeff) POLY_EVAL6SINGLE_HORNER(y, x, coeff) 85 #define POLY_EVAL6(y, x, coeff) POLY_EVAL6_HORNER(y, x, coeff) 91 #define POLY_EVAL3SINGLE(y, x, coeff) POLY_EVAL3SINGLE_HORNER(y, x, coeff) 92 #define POLY_EVAL3(y, x, coeff) POLY_EVAL3_HORNER(y, x, coeff) 93 #define POLY_EVAL6SINGLE(y, x, coeff) POLY_EVAL6SINGLE_ESTRIN(y, x, coeff) 94 #define POLY_EVAL6(y, x, coeff) POLY_EVAL6_ESTRIN(y, x, coeff) 103 #define _PS_CONST(Name) \ 104 extern const ALIGN16_BEG double _pd_##Name[2] ALIGN16_END 105 #define _PS_CONST_TYPE(Name, Type) \ 106 extern const ALIGN16_BEG Type _pd_##Name[2] ALIGN16_END 137 v2df sign = _mm_and_pd(x, sign_mask);
140 x = _mm_andnot_pd(sign_mask, x);
143 v2df modulo = _mm_mul_pd(x,
ARRAY2V2DF(_pd_2OPI));
147 v2di i = _mm_cvttpd_epi32 (modulo);
148 v2df frac = _mm_sub_pd(modulo, _mm_cvtepi32_pd(i));
149 i = _mm_shuffle_epi32 (i, _MM_SHUFFLE(1, 1, 0, 0));
152 v2di gt3 = _mm_slli_epi32(i, 30);
153 v2di mask = _mm_and_si128( gt3, _mm_castpd_si128(sign_mask));
154 sign = _mm_xor_pd(sign, _mm_castsi128_pd(mask));
157 v2di signmask2 = _mm_slli_epi64(i, 63);
158 v2di add =_mm_srli_epi32( _mm_srai_epi32(signmask2,9),2);
159 frac = _mm_add_pd(_mm_or_pd(frac, _mm_castsi128_pd(signmask2)), _mm_castsi128_pd(add));
162 v2df calcx = _mm_mul_pd(frac,
ARRAY2V2DF(_pd_PIO2));
165 v2df xx = _mm_mul_pd(calcx, calcx);
173 y = _mm_mul_pd(y,xx);
174 y = _mm_add_pd(_mm_mul_pd(y, calcx), calcx);
177 y = _mm_or_pd(sign, y);
187 v2df xx = _mm_mul_pd(x, x);
195 y = _mm_mul_pd(y,xx);
196 y = _mm_add_pd(_mm_mul_pd(y, x), x);
203 v2df r =
sin_pd(_mm_set1_pd(x));
204 _mm_storel_pd (&x, r);
211 _mm_storel_pd (&x, r);
215 inline void sin_pd(
double x,
double y,
double& res_x,
double& res_y){
216 v2df r =
sin_pd(_mm_setr_pd(y, x));
217 _mm_storel_pd (&res_x, r);
218 _mm_storeh_pd (&res_y, r);
224 _mm_storel_pd (&res_x, r);
225 _mm_storeh_pd (&res_y, r);
229 inline void sin_cos_pd(
double x,
double& res_sin,
double& res_cos){
230 v2df r =
sin_pd(_mm_setr_pd( x + M_PI/2.0, x));
231 _mm_storeh_pd (&res_sin, r);
232 _mm_storel_pd (&res_cos, r);
242 x = _mm_andnot_pd(sign_mask, x);
245 v2df modulo = _mm_mul_pd(x,
ARRAY2V2DF(_pd_2OPI));
249 v2di i = _mm_cvttpd_epi32 (modulo);
250 v2df frac = _mm_sub_pd(modulo, _mm_cvtepi32_pd(i));
251 i = _mm_shuffle_epi32 (i, _MM_SHUFFLE(1, 1, 0, 0));
254 v2di signmask2 = _mm_slli_epi64(i, 63);
255 v2di add =_mm_srli_epi32( _mm_srai_epi32(signmask2,9),2);
256 frac = _mm_add_pd(_mm_or_pd(frac, _mm_castsi128_pd(signmask2)), _mm_castsi128_pd(add));
259 v2di gt3 = _mm_slli_epi32(_mm_add_epi32(i,
ARRAY2V2DI(_pd_x01_double_mask)), 30);
260 v2df sign = _mm_and_pd( _mm_castsi128_pd(gt3), sign_mask);
263 v2df calcx = _mm_mul_pd(frac,
ARRAY2V2DF(_pd_PIO2));
266 v2df xx = _mm_mul_pd(calcx, calcx);
274 y = _mm_add_pd(_mm_mul_pd(y,xx),
ARRAY2V2DF(_pd_1));
276 y = _mm_or_pd(sign, y);
283 v2df r =
cos_pd(_mm_set1_pd(x));
284 _mm_storel_pd (&x, r);
288 inline double cos_pd(
double x,
double y,
double& res_x,
double& res_y){
289 v2df r =
cos_pd(_mm_setr_pd(y, x));
290 _mm_storel_pd (&res_x, r);
291 _mm_storeh_pd (&res_y, r);
314 v2di e_int = _mm_and_si128(_mm_castpd_si128(x),
ARRAY2V2DI(_pd_f_exp_mask));
315 e_int = _mm_srli_epi64(e_int,52);
316 e_int = _mm_or_si128(_mm_srli_si128(e_int,4), e_int);
317 e_int = _mm_sub_epi32(e_int,
ARRAY2V2DI(_pd_x03FE_double_mask));
318 v2df e = _mm_cvtepi32_pd( e_int);
328 v2df mask =_mm_cmplt_pd(x,
ARRAY2V2DF(_pd_0_87));
329 v2df ex = _mm_and_pd(mask,
ARRAY2V2DF(_pd_log_inv_1_32));
330 v2df mulx = _mm_mul_pd(x,
ARRAY2V2DF(_pd_1_32));
331 v2df v =_mm_or_pd( _mm_and_pd(mask, mulx), _mm_andnot_pd(mask, x));
338 ex = _mm_or_pd(_mm_and_pd(mask,
ARRAY2V2DF(_pd_log_inv_1_74)), _mm_andnot_pd(mask, ex));
340 v =_mm_or_pd( _mm_and_pd(mask, mulx), _mm_andnot_pd(mask, v));
344 v2df term = _mm_div_pd(_mm_sub_pd(v, ones), _mm_add_pd(v,ones));
347 v2df termsquared = _mm_mul_pd(term, term);
355 res = _mm_mul_pd(_mm_mul_pd(res,term), termsquared);
357 res = _mm_add_pd(res, term);
360 v2df r1 = _mm_mul_pd(e,
ARRAY2V2DF(_pd_LOG_C_2));
361 v2df r2 = _mm_mul_pd( _mm_add_pd(ones,ones), res);
362 return _mm_add_pd(r1, _mm_add_pd(r2, ex));
368 v2df r =
log_pd(_mm_set1_pd(x));
369 _mm_storel_pd (&x, r);
373 inline double log_pd(
double x,
double y,
double& res_x,
double& res_y){
374 v2df r =
log_pd(_mm_setr_pd(y, x));
375 _mm_storel_pd (&res_x, r);
376 _mm_storeh_pd (&res_y, r);
383 v2df param = _mm_div_pd( _mm_add_pd(ones,x), _mm_sub_pd(ones,x));
384 v2df logres=
log_pd(param);
385 return _mm_mul_pd(
ARRAY2V2DF(_pd_0_5), logres);
390 _mm_storel_pd (&x, r);
394 inline double atanh_pd(
double x,
double y,
double& res_x,
double& res_y){
395 v2df r =
atanh_pd(_mm_setr_pd(y, x));
396 _mm_storel_pd (&res_x, r);
397 _mm_storeh_pd (&res_y, r);
409 _mm_storel_pd (&x, r);
414 inline double atanh_sin_pd(
double x,
double y,
double& res_x,
double& res_y){
416 _mm_storel_pd (&res_x, r);
417 _mm_storeh_pd (&res_y, r);
#define POLY_EVAL3(y, x, coeff)
Definition: SSEMath.h:92
v2df atanh_pd(v2df x)
Definition: SSEMath.h:381
v2df atanh_sin_pd(v2df x)
Definition: SSEMath.h:403
#define ARRAY2V2DI(name)
Definition: SSEMath.h:36
v2df dangerous_sin_pd(v2df x)
Definition: SSEMath.h:185
v2df log_pd(v2df x)
Definition: SSEMath.h:311
#define ARRAY2V2DF(name)
Definition: SSEMath.h:35
v2df cos_pd(v2df x)
Definition: SSEMath.h:239
_PS_CONST_TYPE(sign_mask, uint64_t)
#define DECLARE_COEFFS(name)
Definition: SSEMath.h:38
v2df sin_pd(v2df x)
Definition: SSEMath.h:133
#define POLY_EVAL6(y, x, coeff)
Definition: SSEMath.h:94
void sin_cos_pd(double x, double &res_sin, double &res_cos)
Definition: SSEMath.h:229