libosmscout  1.1.1
SSEMath.h
Go to the documentation of this file.
1 #ifndef OSMSCOUT_SYSTEM_SSEMATH_H
2 #define OSMSCOUT_SYSTEM_SSEMATH_H
3 
4 /*
5  This source is part of the libosmscout library
6  Copyright (C) 2009 Tim Teulings
7 
8  This library is free software; you can redistribute it and/or
9  modify it under the terms of the GNU Lesser General Public
10  License as published by the Free Software Foundation; either
11  version 2.1 of the License, or (at your option) any later version.
12 
13  This library is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  Lesser General Public License for more details.
17 
18  You should have received a copy of the GNU Lesser General Public
19  License along with this library; if not, write to the Free Software
20  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22 
23 #include <osmscout/CoreFeatures.h>
24 
25 #include <osmscout/system/Math.h>
26 
28 
30 
31 #include <osmscout/private/Config.h>
32 
33 namespace osmscout {
34 
35 #define ARRAY2V2DF(name) * reinterpret_cast<const v2df*>(name)
36 #define ARRAY2V2DI(name) * reinterpret_cast<const v2di*>(name)
37 
38 #define DECLARE_COEFFS(name) extern const ALIGN16_BEG double name[] ALIGN16_END;
39 
40 #define POLY_EVAL3SINGLE_HORNER(y, x, coeff) \
41  y = coeff[0]; y = y*x + coeff[2]; y = y*x + coeff[4]; y = y*x + coeff[6]
42 
43 #define POLY_EVAL3_HORNER(y, x, coeff) \
44  y = ARRAY2V2DF(&coeff[0]); \
45  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[2])); \
46  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[4])); \
47  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[6]))
48 
49 #define POLY_EVAL6SINGLE_HORNER(y, x, coeff) \
50  y = coeff[0]; y = y*x + coeff[2]; y = y*x + coeff[4]; y = y*x + coeff[6]; \
51  y = y*x + coeff[8]; y = y*x + coeff[10]; y = y*x + coeff[12]
52 
53 #define POLY_EVAL6_HORNER(y, x, coeff) \
54  y = ARRAY2V2DF(&coeff[0]); \
55  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[2])); \
56  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[4])); \
57  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[6])); \
58  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[8])); \
59  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[10])); \
60  y = _mm_add_pd(_mm_mul_pd(y, x), ARRAY2V2DF(&coeff[12]))
61 
62 #define POLY_EVAL6SINGLE_ESTRIN(y, x, coeff) \
63  y = coeff[12]; y = y*x + coeff[10]; y = y*x + coeff[8]; y = y*x + coeff[6]; \
64  y = y*x + coeff[4]; y = y*x + coeff[2]; y = y*x + coeff[0]; \
65 
66 #define POLY_EVAL6_ESTRIN(y, x, coeff) \
67  v2df pt0 = _mm_add_pd(ARRAY2V2DF(&coeff[0]), _mm_mul_pd(ARRAY2V2DF(&coeff[ 2]), x)); \
68  v2df pt1 = _mm_add_pd(ARRAY2V2DF(&coeff[4]), _mm_mul_pd(ARRAY2V2DF(&coeff[ 6]), x)); \
69  v2df pt2 = _mm_add_pd(ARRAY2V2DF(&coeff[8]), _mm_mul_pd(ARRAY2V2DF(&coeff[10]), x)); \
70  v2df pt3 = ARRAY2V2DF(&coeff[12]); \
71  v2df ptx2 = _mm_mul_pd(x,x); \
72  pt0 = _mm_add_pd(pt0, _mm_mul_pd(pt1, ptx2)); \
73  pt2 = _mm_add_pd(pt2, _mm_mul_pd(pt3, ptx2)); \
74  v2df ptx4 = _mm_mul_pd(ptx2,ptx2); \
75  y = _mm_add_pd(pt0, _mm_mul_pd(pt2, ptx4))
76 
77 #ifdef _SSE_SLOW_ //For some cpu's this is faster
78 
79 //we assume the cpu can not be sped up by doing several calculations in parralel.
80 //using Horner's method http://en.wikipedia.org/wiki/Horner_scheme
81 
82 #define POLY_EVAL3SINGLE(y, x, coeff) POLY_EVAL3SINGLE_HORNER(y, x, coeff)
83 #define POLY_EVAL3(y, x, coeff) POLY_EVAL3_HORNER(y, x, coeff)
84 #define POLY_EVAL6SINGLE(y, x, coeff) POLY_EVAL6SINGLE_HORNER(y, x, coeff)
85 #define POLY_EVAL6(y, x, coeff) POLY_EVAL6_HORNER(y, x, coeff)
86 
87 #else
88 //we assume the cpu can be sped up by doing calculations in parallel (several xmm registers)
89 //using Estrin's scheme http://en.wikipedia.org/wiki/Estrin%27s_scheme
90 
91 #define POLY_EVAL3SINGLE(y, x, coeff) POLY_EVAL3SINGLE_HORNER(y, x, coeff)
92 #define POLY_EVAL3(y, x, coeff) POLY_EVAL3_HORNER(y, x, coeff)
93 #define POLY_EVAL6SINGLE(y, x, coeff) POLY_EVAL6SINGLE_ESTRIN(y, x, coeff)
94 #define POLY_EVAL6(y, x, coeff) POLY_EVAL6_ESTRIN(y, x, coeff)
95 
96 #endif
97 
98 DECLARE_COEFFS(SINECOEFF_SSE)
99 DECLARE_COEFFS(COSINECOEFF_SSE)
100 DECLARE_COEFFS(LOGCOEFF)
101 
102 /* declare some SSE constants */
103 #define _PS_CONST(Name) \
104  extern const ALIGN16_BEG double _pd_##Name[2] ALIGN16_END
105 #define _PS_CONST_TYPE(Name, Type) \
106  extern const ALIGN16_BEG Type _pd_##Name[2] ALIGN16_END
107 
108 _PS_CONST(1);
109 _PS_CONST(2);
110 _PS_CONST(0_5);
111 _PS_CONST(2OPI);
112 _PS_CONST(PIO2);
113 _PS_CONST(LOG_C_2);
114 
115 _PS_CONST_TYPE(sign_mask, uint64_t);
116 _PS_CONST_TYPE(x01_double_mask, uint64_t);
117 _PS_CONST_TYPE(x03FE_double_mask, uint64_t);
118 _PS_CONST_TYPE(1_exp, uint64_t);
119 _PS_CONST_TYPE(f_fraction_mask, uint64_t);
120 _PS_CONST_TYPE(f_exp_mask, uint64_t);
121 _PS_CONST_TYPE(f_one_mask, uint64_t);
122 _PS_CONST_TYPE(1022, uint64_t);
123 _PS_CONST(0_66);
124 _PS_CONST(0_87);
125 _PS_CONST(1_74);
126 _PS_CONST(1_32);
127 _PS_CONST(log_inv_1_74);
128 _PS_CONST(log_inv_1_32);
129 
130 //approximate a sin using a 6th order polynomal function.
131 //Use the mirroring properties of the sine to calculate
132 //result from only [0..Pi/2] interval. This one can take an __m128d
133 inline v2df sin_pd(v2df x)
134 {
135  //bool sign = x < 0.0;
136  v2df sign_mask = ARRAY2V2DF(_pd_sign_mask);
137  v2df sign = _mm_and_pd(x, sign_mask);
138 
139  //x = fabs(x);
140  x = _mm_andnot_pd(sign_mask, x);
141 
142  //double modulo = x * 2.0/PI;
143  v2df modulo = _mm_mul_pd(x, ARRAY2V2DF(_pd_2OPI));
144 
145  //int i = static_cast<int>(floor(modulo));
146  //double frac = modulo-i;
147  v2di i = _mm_cvttpd_epi32 (modulo);
148  v2df frac = _mm_sub_pd(modulo, _mm_cvtepi32_pd(i));
149  i = _mm_shuffle_epi32 (i, _MM_SHUFFLE(1, 1, 0, 0)); //resuffle to align
150 
151  //if (i&2) sign = !sign;
152  v2di gt3 = _mm_slli_epi32(i, 30);
153  v2di mask = _mm_and_si128( gt3, _mm_castpd_si128(sign_mask));
154  sign = _mm_xor_pd(sign, _mm_castsi128_pd(mask));
155 
156  //if (i & 1) frac = 1 - frac;
157  v2di signmask2 = _mm_slli_epi64(i, 63); //shift lsb to msb. Is sign position for doubles
158  v2di add =_mm_srli_epi32( _mm_srai_epi32(signmask2,9),2); //makes 1.0 if msb was set
159  frac = _mm_add_pd(_mm_or_pd(frac, _mm_castsi128_pd(signmask2)), _mm_castsi128_pd(add)); //(where msb is set negate frac) then add 1 if msb was set.
160 
161  //double calcx = frac * PI/2;
162  v2df calcx = _mm_mul_pd(frac, ARRAY2V2DF(_pd_PIO2));
163 
164  //double xx = calcx * calcx;
165  v2df xx = _mm_mul_pd(calcx, calcx);
166 
167  //double y;
168  v2df y;
169 
170  //y = SINECOEFF_SSE[0]; etc
171  POLY_EVAL6(y, xx, SINECOEFF_SSE);
172  // y = calcx * y*xx+ calcx;
173  y = _mm_mul_pd(y,xx);
174  y = _mm_add_pd(_mm_mul_pd(y, calcx), calcx);
175 
176  //if (sign) y = -y;
177  y = _mm_or_pd(sign, y);
178 
179  return y;
180 }
181 
182 //an sse variant that does not check the input range.
183 //THIS METHOD IS ONLY VALID ON [-Pi/2,Pi/2]
184 //but it is 3 times faster as the one above, and more accurate.
185 inline v2df dangerous_sin_pd(v2df x){
186  //double xx = x * x;
187  v2df xx = _mm_mul_pd(x, x);
188 
189  //double y;
190  v2df y;
191 
192  //y = SINECOEFF_SSE[0]; etc
193  POLY_EVAL6(y, xx, SINECOEFF_SSE);
194  // y = calcx * y*xx+ calcx;
195  y = _mm_mul_pd(y,xx);
196  y = _mm_add_pd(_mm_mul_pd(y, x), x);
197 
198  return y;
199 }
200 
201 //some helper functions to test
202 inline double sin_pd(double x){
203  v2df r = sin_pd(_mm_set1_pd(x));
204  _mm_storel_pd (&x, r);
205  return x;
206 }
207 
208 //THIS METHOD IS ONLY VALID ON [-Pi/2,Pi/2]
209 inline double dangerous_sin_pd(double x){
210  v2df r = dangerous_sin_pd(_mm_set1_pd(x));
211  _mm_storel_pd (&x, r);
212  return x;
213 }
214 
215 inline void sin_pd(double x, double y, double& res_x, double& res_y){
216  v2df r = sin_pd(_mm_setr_pd(y, x));
217  _mm_storel_pd (&res_x, r);
218  _mm_storeh_pd (&res_y, r);
219 }
220 
221 //THIS METHOD IS ONLY VALID ON [-Pi/2,Pi/2]
222 inline void dangerous_sin_pd(double x, double y, double& res_x, double& res_y){
223  v2df r = dangerous_sin_pd(_mm_setr_pd(y, x));
224  _mm_storel_pd (&res_x, r);
225  _mm_storeh_pd (&res_y, r);
226 }
227 
228 //calculate a sine and cosine
229 inline void sin_cos_pd(double x, double& res_sin, double& res_cos){
230  v2df r = sin_pd(_mm_setr_pd( x + M_PI/2.0, x));
231  _mm_storeh_pd (&res_sin, r);
232  _mm_storel_pd (&res_cos, r);
233 }
234 
235 
236 //approximate cosine using a 6th order polynomal function.
237 //Use the mirroring properties of the cosine to calculate
238 //result from only [0..Pi/2] interval. This one can take an __m128d
239 inline v2df cos_pd(v2df x){
240  //x = fabs(x);
241  v2df sign_mask = ARRAY2V2DF(_pd_sign_mask);
242  x = _mm_andnot_pd(sign_mask, x);
243 
244  //double modulo = x * 2.0/PI;
245  v2df modulo = _mm_mul_pd(x, ARRAY2V2DF(_pd_2OPI));
246 
247  //int i = static_cast<int>(floor(modulo));
248  //double frac = modulo-i;
249  v2di i = _mm_cvttpd_epi32 (modulo);
250  v2df frac = _mm_sub_pd(modulo, _mm_cvtepi32_pd(i));
251  i = _mm_shuffle_epi32 (i, _MM_SHUFFLE(1, 1, 0, 0)); //resuffle to align
252 
253  //if (i & 1) frac = 1 - frac;
254  v2di signmask2 = _mm_slli_epi64(i, 63); //shift lsb to msb. Is sign position for doubles
255  v2di add =_mm_srli_epi32( _mm_srai_epi32(signmask2,9),2); //makes 1.0 if msb was set
256  frac = _mm_add_pd(_mm_or_pd(frac, _mm_castsi128_pd(signmask2)), _mm_castsi128_pd(add)); //(where msb is set negate frac) then add 1 if msb was set.
257 
258  //sign = ((i+1)&2)==2;
259  v2di gt3 = _mm_slli_epi32(_mm_add_epi32(i, ARRAY2V2DI(_pd_x01_double_mask)), 30);
260  v2df sign = _mm_and_pd( _mm_castsi128_pd(gt3), sign_mask);
261 
262  //double calcx = frac * PI/2;
263  v2df calcx = _mm_mul_pd(frac, ARRAY2V2DF(_pd_PIO2));
264 
265  //double xx = calcx * calcx;
266  v2df xx = _mm_mul_pd(calcx, calcx);
267 
268  //double y;
269  v2df y;
270 
271  //y = COSINECOEFF[0]; etc
272  POLY_EVAL6(y, xx, COSINECOEFF_SSE);
273  // y = y*xx + 1;
274  y = _mm_add_pd(_mm_mul_pd(y,xx), ARRAY2V2DF(_pd_1));
275 
276  y = _mm_or_pd(sign, y);
277 
278  return y;
279 }
280 
281 //some helper functions
282 inline double cos_pd(double x){
283  v2df r = cos_pd(_mm_set1_pd(x));
284  _mm_storel_pd (&x, r);
285  return x;
286 }
287 
288 inline double cos_pd(double x, double y, double& res_x, double& res_y){
289  v2df r = cos_pd(_mm_setr_pd(y, x));
290  _mm_storel_pd (&res_x, r);
291  _mm_storeh_pd (&res_y, r);
292  return x;
293 }
294 
295 //calculate log in sse form.
296 //This is done by doing math :)
297 
298 //A double is stored as: 2^exp * mantise
299 
300 //using:
301 // ln(x) = log2(x)/ln(2);
302 // ln(a*b) = ln(a)+ln(b)
303 // log2(2^exp) = exp
304 //
305 // we can simplefy to ln(2^exp * mantise) = ln(2^exp) * ln(mantise) = log2(2^exp)/ln(2) *ln(mantise) = exp * ln(2) * ln(mantise)
306 // mantise is in range [0.5, 1.0[
307 
308 //by scaling this range so that is is around 1, we can use this method: http://en.wikipedia.org/wiki/Natural_logarithm#Numerical_value
309 //that is fast converging around 1.
310 
311 inline v2df log_pd(v2df x)
312 {
313  //x = frexp( x, &e );
314  v2di e_int = _mm_and_si128(_mm_castpd_si128(x), ARRAY2V2DI(_pd_f_exp_mask));
315  e_int = _mm_srli_epi64(e_int,52);
316  e_int = _mm_or_si128(_mm_srli_si128(e_int,4), e_int);
317  e_int = _mm_sub_epi32(e_int, ARRAY2V2DI(_pd_x03FE_double_mask));
318  v2df e = _mm_cvtepi32_pd( e_int);
319  x = _mm_or_pd(_mm_and_pd(x, ARRAY2V2DF(_pd_f_fraction_mask)), ARRAY2V2DF(_pd_f_one_mask));
320 
321  //double ex = 0.0;
322  //v = x;
323 
324  //if (x < 0.66) {
325  // ex = l1;
326  // v = x*1.74;
327  //}
328  v2df mask =_mm_cmplt_pd(x, ARRAY2V2DF(_pd_0_87));
329  v2df ex = _mm_and_pd(mask, ARRAY2V2DF(_pd_log_inv_1_32));
330  v2df mulx = _mm_mul_pd(x,ARRAY2V2DF(_pd_1_32));
331  v2df v =_mm_or_pd( _mm_and_pd(mask, mulx), _mm_andnot_pd(mask, x));
332 
333  //else if (x < 0.87) {
334  // ex = l2;
335  // v = x* 1.32;
336  //}
337  mask =_mm_cmplt_pd(x, ARRAY2V2DF(_pd_0_66));
338  ex = _mm_or_pd(_mm_and_pd(mask, ARRAY2V2DF(_pd_log_inv_1_74)), _mm_andnot_pd(mask, ex));
339  mulx = _mm_mul_pd(x,ARRAY2V2DF(_pd_1_74));
340  v =_mm_or_pd( _mm_and_pd(mask, mulx), _mm_andnot_pd(mask, v));
341 
342  //double term = (v-1)/(v+1);
343  v2df ones = ARRAY2V2DF(_pd_1);
344  v2df term = _mm_div_pd(_mm_sub_pd(v, ones), _mm_add_pd(v,ones));
345 
346  //double termsquared = term*term;
347  v2df termsquared = _mm_mul_pd(term, term);
348 
349  //double z = term;
350 
351  //double res = 1.0/9.0;
352  v2df res;
353  POLY_EVAL3(res, termsquared , LOGCOEFF);
354  //res *= z * termsquared;
355  res = _mm_mul_pd(_mm_mul_pd(res,term), termsquared);
356  //res += z;
357  res = _mm_add_pd(res, term);
358 
359  //return (e * _pd_LOG_C_2[0])+ex+2* res;
360  v2df r1 = _mm_mul_pd(e, ARRAY2V2DF(_pd_LOG_C_2));
361  v2df r2 = _mm_mul_pd( _mm_add_pd(ones,ones), res);
362  return _mm_add_pd(r1, _mm_add_pd(r2, ex));
363 }
364 
365 
366 //help functions
367 inline double log_pd(double x){
368  v2df r = log_pd(_mm_set1_pd(x));
369  _mm_storel_pd (&x, r);
370  return x;
371 }
372 
373 inline double log_pd(double x, double y, double& res_x, double& res_y){
374  v2df r = log_pd(_mm_setr_pd(y, x));
375  _mm_storel_pd (&res_x, r);
376  _mm_storeh_pd (&res_y, r);
377  return x;
378 }
379 
380 //calculate atanh
381 inline v2df atanh_pd(v2df x){
382  v2df ones = ARRAY2V2DF(_pd_1);
383  v2df param = _mm_div_pd( _mm_add_pd(ones,x), _mm_sub_pd(ones,x));
384  v2df logres=log_pd(param);
385  return _mm_mul_pd(ARRAY2V2DF(_pd_0_5), logres);
386 }
387 
388 inline double atanh_pd(double x){
389  v2df r = atanh_pd(_mm_set1_pd(x));
390  _mm_storel_pd (&x, r);
391  return x;
392 }
393 
394 inline double atanh_pd(double x, double y, double& res_x, double& res_y){
395  v2df r = atanh_pd(_mm_setr_pd(y, x));
396  _mm_storel_pd (&res_x, r);
397  _mm_storeh_pd (&res_y, r);
398  return x;
399 }
400 
401 //calculate atan(sin(x))
402 //note that this is only good between 0.944*-Pi/2 < x < 0.944* pi/2 which is up to 85 degrees
403 inline v2df atanh_sin_pd(v2df x){
404  return atanh_pd(dangerous_sin_pd(x));
405 }
406 
407 inline double atanh_sin_pd(double x){
408  v2df r = atanh_sin_pd(_mm_set1_pd(x));
409  _mm_storel_pd (&x, r);
410  return x;
411 }
412 
413 //calculate atan(sin(x)) and atan(sin(y))
414 inline double atanh_sin_pd(double x, double y, double& res_x, double& res_y){
415  v2df r = atanh_sin_pd(_mm_setr_pd(y, x));
416  _mm_storel_pd (&res_x, r);
417  _mm_storeh_pd (&res_y, r);
418  return x;
419 }
420 
421 }
422 #endif
#define POLY_EVAL3(y, x, coeff)
Definition: SSEMath.h:92
v2df atanh_pd(v2df x)
Definition: SSEMath.h:381
v2df atanh_sin_pd(v2df x)
Definition: SSEMath.h:403
#define ARRAY2V2DI(name)
Definition: SSEMath.h:36
v2df dangerous_sin_pd(v2df x)
Definition: SSEMath.h:185
v2df log_pd(v2df x)
Definition: SSEMath.h:311
#define ARRAY2V2DF(name)
Definition: SSEMath.h:35
Definition: Area.h:38
v2df cos_pd(v2df x)
Definition: SSEMath.h:239
_PS_CONST_TYPE(sign_mask, uint64_t)
#define DECLARE_COEFFS(name)
Definition: SSEMath.h:38
v2df sin_pd(v2df x)
Definition: SSEMath.h:133
#define POLY_EVAL6(y, x, coeff)
Definition: SSEMath.h:94
void sin_cos_pd(double x, double &res_sin, double &res_cos)
Definition: SSEMath.h:229