libosmscout 1.1.1
Loading...
Searching...
No Matches
SSEMath.h
Go to the documentation of this file.
1#ifndef OSMSCOUT_SYSTEM_SSEMATH_H
2#define OSMSCOUT_SYSTEM_SSEMATH_H
3
4/*
5 This source is part of the libosmscout library
6 Copyright (C) 2009 Tim Teulings
7
8 This library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 2.1 of the License, or (at your option) any later version.
12
13 This library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public
19 License along with this library; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21*/
22
23#include <osmscout/lib/CoreFeatures.h>
24
28
29#include <osmscout/private/Config.h>
30
31namespace osmscout {
32
33#define ARRAY2V2DF(name) * reinterpret_cast<const v2df*>(name)
34#define ARRAY2V2DI(name) * reinterpret_cast<const v2di*>(name)
35
36#define DECLARE_COEFFS(name) extern const ALIGN16_BEG double name[] ALIGN16_END;
37
38#define POLY_EVAL3SINGLE_HORNER(y, x, coeff) \
39 y = coeff[0]; y = y*x + coeff[2]; y = y*x + coeff[4]; y = y*x + coeff[6]
40
41#define POLY_EVAL3_HORNER(y, x, coeff) \
42 y = ARRAY2V2DF(&coeff[0]); \
43 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[2])); \
44 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[4])); \
45 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[6]))
46
47#define POLY_EVAL6SINGLE_HORNER(y, x, coeff) \
48 y = coeff[0]; y = y*x + coeff[2]; y = y*x + coeff[4]; y = y*x + coeff[6]; \
49 y = y*x + coeff[8]; y = y*x + coeff[10]; y = y*x + coeff[12]
50
51#define POLY_EVAL6_HORNER(y, x, coeff) \
52 y = ARRAY2V2DF(&coeff[0]); \
53 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[2])); \
54 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[4])); \
55 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[6])); \
56 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[8])); \
57 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[10])); \
58 y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[12]))
59
60#define POLY_EVAL6SINGLE_ESTRIN(y, x, coeff) \
61 y = coeff[12]; y = y*x + coeff[10]; y = y*x + coeff[8]; y = y*x + coeff[6]; \
62 y = y*x + coeff[4]; y = y*x + coeff[2]; y = y*x + coeff[0]; \
63
64#define POLY_EVAL6_ESTRIN(y, x, coeff) \
65 v2df pt0 = _mm_add_pd(ARRAY2V2DF(&coeff[0]), _mm_mul_pd(ARRAY2V2DF(&coeff[ 2]), x)); \
66 v2df pt1 = _mm_add_pd(ARRAY2V2DF(&coeff[4]), _mm_mul_pd(ARRAY2V2DF(&coeff[ 6]), x)); \
67 v2df pt2 = _mm_add_pd(ARRAY2V2DF(&coeff[8]), _mm_mul_pd(ARRAY2V2DF(&coeff[10]), x)); \
68 v2df pt3 = ARRAY2V2DF(&coeff[12]); \
69 v2df ptx2 = _mm_mul_pd(x,x); \
70 pt0 = _mm_add_pd(pt0, _mm_mul_pd(pt1, ptx2)); \
71 pt2 = _mm_add_pd(pt2, _mm_mul_pd(pt3, ptx2)); \
72 v2df ptx4 = _mm_mul_pd(ptx2,ptx2); \
73 y = _mm_add_pd(pt0, _mm_mul_pd(pt2, ptx4))
74
75#ifdef _SSE_SLOW_ //For some cpu's this is faster
76
77//we assume the cpu can not be sped up by doing several calculations in parralel.
78//using Horner's method http://en.wikipedia.org/wiki/Horner_scheme
79
80#define POLY_EVAL3SINGLE(y, x, coeff) POLY_EVAL3SINGLE_HORNER(y, x, coeff)
81#define POLY_EVAL3(y, x, coeff) POLY_EVAL3_HORNER(y, x, coeff)
82#define POLY_EVAL6SINGLE(y, x, coeff) POLY_EVAL6SINGLE_HORNER(y, x, coeff)
83#define POLY_EVAL6(y, x, coeff) POLY_EVAL6_HORNER(y, x, coeff)
84
85#else
86//we assume the cpu can be sped up by doing calculations in parallel (several xmm registers)
87//using Estrin's scheme http://en.wikipedia.org/wiki/Estrin%27s_scheme
88
89#define POLY_EVAL3SINGLE(y, x, coeff) POLY_EVAL3SINGLE_HORNER(y, x, coeff)
90#define POLY_EVAL3(y, x, coeff) POLY_EVAL3_HORNER(y, x, coeff)
91#define POLY_EVAL6SINGLE(y, x, coeff) POLY_EVAL6SINGLE_ESTRIN(y, x, coeff)
92#define POLY_EVAL6(y, x, coeff) POLY_EVAL6_ESTRIN(y, x, coeff)
93
94#endif
95
96DECLARE_COEFFS(SINECOEFF_SSE)
97DECLARE_COEFFS(COSINECOEFF_SSE)
98DECLARE_COEFFS(LOGCOEFF)
99
100/* declare some SSE constants */
101#define _PS_CONST(Name) \
102 extern const ALIGN16_BEG double _pd_##Name[2] ALIGN16_END
103#define _PS_CONST_TYPE(Name, Type) \
104 extern const ALIGN16_BEG Type _pd_##Name[2] ALIGN16_END
105
111_PS_CONST(LOG_C_2);
112
113_PS_CONST_TYPE(sign_mask, uint64_t);
114_PS_CONST_TYPE(x01_double_mask, uint64_t);
115_PS_CONST_TYPE(x03FE_double_mask, uint64_t);
116_PS_CONST_TYPE(1_exp, uint64_t);
117_PS_CONST_TYPE(f_fraction_mask, uint64_t);
118_PS_CONST_TYPE(f_exp_mask, uint64_t);
119_PS_CONST_TYPE(f_one_mask, uint64_t);
120_PS_CONST_TYPE(1022, uint64_t);
125_PS_CONST(log_inv_1_74);
126_PS_CONST(log_inv_1_32);
127
128//approximate a sin using a 6th order polynomal function.
129//Use the mirroring properties of the sine to calculate
130//result from only [0..Pi/2] interval. This one can take an __m128d
131inline v2df sin_pd(v2df x)
132{
133 //bool sign = x < 0.0;
134 v2df sign_mask = ARRAY2V2DF(_pd_sign_mask);
135 v2df sign = _mm_and_pd(x, sign_mask);
136
137 //x = fabs(x);
138 x = _mm_andnot_pd(sign_mask, x);
139
140 //double modulo = x * 2.0/PI;
141 v2df modulo = _mm_mul_pd(x, ARRAY2V2DF(_pd_2OPI));
142
143 //int i = static_cast<int>(floor(modulo));
144 //double frac = modulo-i;
145 v2di i = _mm_cvttpd_epi32 (modulo);
146 v2df frac = _mm_sub_pd(modulo, _mm_cvtepi32_pd(i));
147 i = _mm_shuffle_epi32 (i, _MM_SHUFFLE(1, 1, 0, 0)); //resuffle to align
148
149 //if (i&2) sign = !sign;
150 v2di gt3 = _mm_slli_epi32(i, 30);
151 v2di mask = _mm_and_si128( gt3, _mm_castpd_si128(sign_mask));
152 sign = _mm_xor_pd(sign, _mm_castsi128_pd(mask));
153
154 //if (i & 1) frac = 1 - frac;
155 v2di signmask2 = _mm_slli_epi64(i, 63); //shift lsb to msb. Is sign position for doubles
156 v2di add =_mm_srli_epi32( _mm_srai_epi32(signmask2,9),2); //makes 1.0 if msb was set
157 frac = _mm_add_pd(_mm_or_pd(frac, _mm_castsi128_pd(signmask2)), _mm_castsi128_pd(add)); //(where msb is set negate frac) then add 1 if msb was set.
158
159 //double calcx = frac * PI/2;
160 v2df calcx = _mm_mul_pd(frac, ARRAY2V2DF(_pd_PIO2));
161
162 //double xx = calcx * calcx;
163 v2df xx = _mm_mul_pd(calcx, calcx);
164
165 //double y;
166 v2df y;
167
168 //y = SINECOEFF_SSE[0]; etc
169 POLY_EVAL6(y, xx, SINECOEFF_SSE);
170 // y = calcx * y*xx+ calcx;
171 y = _mm_mul_pd(y,xx);
172 y = _mm_add_pd(_mm_mul_pd(y, calcx), calcx);
173
174 //if (sign) y = -y;
175 y = _mm_or_pd(sign, y);
176
177 return y;
178}
179
180//an sse variant that does not check the input range.
181//THIS METHOD IS ONLY VALID ON [-Pi/2,Pi/2]
182//but it is 3 times faster as the one above, and more accurate.
183inline v2df dangerous_sin_pd(v2df x){
184 //double xx = x * x;
185 v2df xx = _mm_mul_pd(x, x);
186
187 //double y;
188 v2df y;
189
190 //y = SINECOEFF_SSE[0]; etc
191 POLY_EVAL6(y, xx, SINECOEFF_SSE);
192 // y = calcx * y*xx+ calcx;
193 y = _mm_mul_pd(y,xx);
194 y = _mm_add_pd(_mm_mul_pd(y, x), x);
195
196 return y;
197}
198
199//some helper functions to test
200inline double sin_pd(double x){
201 v2df r = sin_pd(_mm_set1_pd(x));
202 _mm_storel_pd (&x, r);
203 return x;
204}
205
206//THIS METHOD IS ONLY VALID ON [-Pi/2,Pi/2]
207inline double dangerous_sin_pd(double x){
208 v2df r = dangerous_sin_pd(_mm_set1_pd(x));
209 _mm_storel_pd (&x, r);
210 return x;
211}
212
213inline void sin_pd(double x, double y, double& res_x, double& res_y){
214 v2df r = sin_pd(_mm_setr_pd(y, x));
215 _mm_storel_pd (&res_x, r);
216 _mm_storeh_pd (&res_y, r);
217}
218
219//THIS METHOD IS ONLY VALID ON [-Pi/2,Pi/2]
220inline void dangerous_sin_pd(double x, double y, double& res_x, double& res_y){
221 v2df r = dangerous_sin_pd(_mm_setr_pd(y, x));
222 _mm_storel_pd (&res_x, r);
223 _mm_storeh_pd (&res_y, r);
224}
225
226//calculate a sine and cosine
227inline void sin_cos_pd(double x, double& res_sin, double& res_cos){
228 v2df r = sin_pd(_mm_setr_pd( x + M_PI/2.0, x));
229 _mm_storeh_pd (&res_sin, r);
230 _mm_storel_pd (&res_cos, r);
231}
232
233
234//approximate cosine using a 6th order polynomal function.
235//Use the mirroring properties of the cosine to calculate
236//result from only [0..Pi/2] interval. This one can take an __m128d
237inline v2df cos_pd(v2df x){
238 //x = fabs(x);
239 v2df sign_mask = ARRAY2V2DF(_pd_sign_mask);
240 x = _mm_andnot_pd(sign_mask, x);
241
242 //double modulo = x * 2.0/PI;
243 v2df modulo = _mm_mul_pd(x, ARRAY2V2DF(_pd_2OPI));
244
245 //int i = static_cast<int>(floor(modulo));
246 //double frac = modulo-i;
247 v2di i = _mm_cvttpd_epi32 (modulo);
248 v2df frac = _mm_sub_pd(modulo, _mm_cvtepi32_pd(i));
249 i = _mm_shuffle_epi32 (i, _MM_SHUFFLE(1, 1, 0, 0)); //resuffle to align
250
251 //if (i & 1) frac = 1 - frac;
252 v2di signmask2 = _mm_slli_epi64(i, 63); //shift lsb to msb. Is sign position for doubles
253 v2di add =_mm_srli_epi32( _mm_srai_epi32(signmask2,9),2); //makes 1.0 if msb was set
254 frac = _mm_add_pd(_mm_or_pd(frac, _mm_castsi128_pd(signmask2)), _mm_castsi128_pd(add)); //(where msb is set negate frac) then add 1 if msb was set.
255
256 //sign = ((i+1)&2)==2;
257 v2di gt3 = _mm_slli_epi32(_mm_add_epi32(i, ARRAY2V2DI(_pd_x01_double_mask)), 30);
258 v2df sign = _mm_and_pd( _mm_castsi128_pd(gt3), sign_mask);
259
260 //double calcx = frac * PI/2;
261 v2df calcx = _mm_mul_pd(frac, ARRAY2V2DF(_pd_PIO2));
262
263 //double xx = calcx * calcx;
264 v2df xx = _mm_mul_pd(calcx, calcx);
265
266 //double y;
267 v2df y;
268
269 //y = COSINECOEFF[0]; etc
270 POLY_EVAL6(y, xx, COSINECOEFF_SSE);
271 // y = y*xx + 1;
272 y = _mm_add_pd(_mm_mul_pd(y,xx), ARRAY2V2DF(_pd_1));
273
274 y = _mm_or_pd(sign, y);
275
276 return y;
277}
278
279//some helper functions
280inline double cos_pd(double x){
281 v2df r = cos_pd(_mm_set1_pd(x));
282 _mm_storel_pd (&x, r);
283 return x;
284}
285
286inline double cos_pd(double x, double y, double& res_x, double& res_y){
287 v2df r = cos_pd(_mm_setr_pd(y, x));
288 _mm_storel_pd (&res_x, r);
289 _mm_storeh_pd (&res_y, r);
290 return x;
291}
292
293//calculate log in sse form.
294//This is done by doing math :)
295
296//A double is stored as: 2^exp * mantise
297
298//using:
299// ln(x) = log2(x)/ln(2);
300// ln(a*b) = ln(a)+ln(b)
301// log2(2^exp) = exp
302//
303// we can simplefy to ln(2^exp * mantise) = ln(2^exp) * ln(mantise) = log2(2^exp)/ln(2) *ln(mantise) = exp * ln(2) * ln(mantise)
304// mantise is in range [0.5, 1.0[
305
306//by scaling this range so that is is around 1, we can use this method: http://en.wikipedia.org/wiki/Natural_logarithm#Numerical_value
307//that is fast converging around 1.
308
309inline v2df log_pd(v2df x)
310{
311 //x = frexp( x, &e );
312 v2di e_int = _mm_and_si128(_mm_castpd_si128(x), ARRAY2V2DI(_pd_f_exp_mask));
313 e_int = _mm_srli_epi64(e_int,52);
314 e_int = _mm_or_si128(_mm_srli_si128(e_int,4), e_int);
315 e_int = _mm_sub_epi32(e_int, ARRAY2V2DI(_pd_x03FE_double_mask));
316 v2df e = _mm_cvtepi32_pd( e_int);
317 x = _mm_or_pd(_mm_and_pd(x, ARRAY2V2DF(_pd_f_fraction_mask)), ARRAY2V2DF(_pd_f_one_mask));
318
319 //double ex = 0.0;
320 //v = x;
321
322 //if (x < 0.66) {
323 // ex = l1;
324 // v = x*1.74;
325 //}
326 v2df mask =_mm_cmplt_pd(x, ARRAY2V2DF(_pd_0_87));
327 v2df ex = _mm_and_pd(mask, ARRAY2V2DF(_pd_log_inv_1_32));
328 v2df mulx = _mm_mul_pd(x,ARRAY2V2DF(_pd_1_32));
329 v2df v =_mm_or_pd( _mm_and_pd(mask, mulx), _mm_andnot_pd(mask, x));
330
331 //else if (x < 0.87) {
332 // ex = l2;
333 // v = x* 1.32;
334 //}
335 mask =_mm_cmplt_pd(x, ARRAY2V2DF(_pd_0_66));
336 ex = _mm_or_pd(_mm_and_pd(mask, ARRAY2V2DF(_pd_log_inv_1_74)), _mm_andnot_pd(mask, ex));
337 mulx = _mm_mul_pd(x,ARRAY2V2DF(_pd_1_74));
338 v =_mm_or_pd( _mm_and_pd(mask, mulx), _mm_andnot_pd(mask, v));
339
340 //double term = (v-1)/(v+1);
341 v2df ones = ARRAY2V2DF(_pd_1);
342 v2df term = _mm_div_pd(_mm_sub_pd(v, ones), _mm_add_pd(v,ones));
343
344 //double termsquared = term*term;
345 v2df termsquared = _mm_mul_pd(term, term);
346
347 //double z = term;
348
349 //double res = 1.0/9.0;
350 v2df res;
351 POLY_EVAL3(res, termsquared , LOGCOEFF);
352 //res *= z * termsquared;
353 res = _mm_mul_pd(_mm_mul_pd(res,term), termsquared);
354 //res += z;
355 res = _mm_add_pd(res, term);
356
357 //return (e * _pd_LOG_C_2[0])+ex+2* res;
358 v2df r1 = _mm_mul_pd(e, ARRAY2V2DF(_pd_LOG_C_2));
359 v2df r2 = _mm_mul_pd( _mm_add_pd(ones,ones), res);
360 return _mm_add_pd(r1, _mm_add_pd(r2, ex));
361}
362
363
364//help functions
365inline double log_pd(double x){
366 v2df r = log_pd(_mm_set1_pd(x));
367 _mm_storel_pd (&x, r);
368 return x;
369}
370
371inline double log_pd(double x, double y, double& res_x, double& res_y){
372 v2df r = log_pd(_mm_setr_pd(y, x));
373 _mm_storel_pd (&res_x, r);
374 _mm_storeh_pd (&res_y, r);
375 return x;
376}
377
378//calculate atanh
379inline v2df atanh_pd(v2df x){
380 v2df ones = ARRAY2V2DF(_pd_1);
381 v2df param = _mm_div_pd( _mm_add_pd(ones,x), _mm_sub_pd(ones,x));
382 v2df logres=log_pd(param);
383 return _mm_mul_pd(ARRAY2V2DF(_pd_0_5), logres);
384}
385
386inline double atanh_pd(double x){
387 v2df r = atanh_pd(_mm_set1_pd(x));
388 _mm_storel_pd (&x, r);
389 return x;
390}
391
392inline double atanh_pd(double x, double y, double& res_x, double& res_y){
393 v2df r = atanh_pd(_mm_setr_pd(y, x));
394 _mm_storel_pd (&res_x, r);
395 _mm_storeh_pd (&res_y, r);
396 return x;
397}
398
399//calculate atan(sin(x))
400//note that this is only good between 0.944*-Pi/2 < x < 0.944* pi/2 which is up to 85 degrees
401inline v2df atanh_sin_pd(v2df x){
402 return atanh_pd(dangerous_sin_pd(x));
403}
404
405inline double atanh_sin_pd(double x){
406 v2df r = atanh_sin_pd(_mm_set1_pd(x));
407 _mm_storel_pd (&x, r);
408 return x;
409}
410
411//calculate atan(sin(x)) and atan(sin(y))
412inline double atanh_sin_pd(double x, double y, double& res_x, double& res_y){
413 v2df r = atanh_sin_pd(_mm_setr_pd(y, x));
414 _mm_storel_pd (&res_x, r);
415 _mm_storeh_pd (&res_y, r);
416 return x;
417}
418
419}
420#endif
#define _PS_CONST(Name)
Definition SSEMath.h:101
#define ARRAY2V2DF(name)
Definition SSEMath.h:33
#define POLY_EVAL3(y, x, coeff)
Definition SSEMath.h:90
#define DECLARE_COEFFS(name)
Definition SSEMath.h:36
#define ARRAY2V2DI(name)
Definition SSEMath.h:34
#define _PS_CONST_TYPE(Name, Type)
Definition SSEMath.h:103
#define POLY_EVAL6(y, x, coeff)
Definition SSEMath.h:92
Definition Area.h:39
v2df atanh_sin_pd(v2df x)
Definition SSEMath.h:401
v2df cos_pd(v2df x)
Definition SSEMath.h:237
v2df sin_pd(v2df x)
Definition SSEMath.h:131
void sin_cos_pd(double x, double &res_sin, double &res_cos)
Definition SSEMath.h:227
v2df atanh_pd(v2df x)
Definition SSEMath.h:379
v2df log_pd(v2df x)
Definition SSEMath.h:309
v2df dangerous_sin_pd(v2df x)
Definition SSEMath.h:183