// absbench.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include <Windows.h>
#include <intrin.h> 
#include <emmintrin.h>
#include "compilerinfo.h"

// N: number of 64-bit elements in the global test buffer `f` (0x100000 = 1,048,576).
// bench() also passes N as the `count` argument to every benchmark loop.
#define N 0x100000
// usearray: when defined, the benchmark loops read their operands from the
// buffer `f`; when undefined, the alternative loops after `#else` feed the
// loop counter itself to the popcount routines.
#define usearray

// Global test buffer of 8*N bytes, viewable as 8-, 16-, 32- or 64-bit elements.
// All members overlay the same storage, so the byte-wise fill done in _tmain()
// provides the data for every element width.
union field { 
UCHAR  f8[8*N];    // byte view
WORD   f16[4*N]; 
UINT   f32[2*N];   // 32-bit view
UINT64 f64[N];     // 64-bit view
} f;



// Lookup table: hamming[v] holds the number of set bits of v for every
// 16-bit value v. Filled by sethamming() before any benchmark runs.
UCHAR hamming[65536];

// Software popcount for a 32-bit value: repeatedly clears the lowest set bit
// and counts how many times that was possible.
inline int popcntx(UINT x){
	int bits = 0;
	while (x != 0) {
		x &= x - 1;   // drops the least significant 1-bit
		++bits;
	}
	return bits;
}

// Software popcount for a 32-bit value using parallel partial sums:
// in five steps, adjacent bit groups of width 1, 2, 4, 8 and 16 are added
// pairwise until x holds the total number of set bits.
inline int popcnty (UINT x ) {
	static unsigned int mask[] = {
		0x55555555,
		0x33333333,
		0x0F0F0F0F,
		0x00FF00FF,
		0x0000FFFF
	};

	int shift = 1;
	for (int step = 0; step < 5; ++step) {
		unsigned int m = mask[step];
		x = (x & m) + ((x >> shift) & m);
		shift *= 2;   // group width doubles each step
	}
	return x;
}



// 64-bit counterpart of popcntx(): counts set bits by clearing the lowest
// set bit until the value becomes zero.
inline __int64 popcntx64 (UINT64 x) { 
	int bits = 0;
	while (x != 0) {
		x &= x - 1;   // drops the least significant 1-bit
		++bits;
	}
	return bits;
}

#ifdef usearray
// Divisor bench() uses to turn a total time into a per-operation time.
// In array mode bench() calls each loop with count = N.
#define ops N
// Array mode: sums __popcnt() over the first `count` 32-bit words of f.
_int64 runloop32  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += __popcnt(f.f32[idx]);
	}
	return total;
}

// Array mode: sums _mm_popcnt_u32() over the first `count` 32-bit words of f.
__int64 runloopm32  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += _mm_popcnt_u32(f.f32[idx]);
	}
	return total;
}

// Array mode: sums __popcnt16() over the first `count` 16-bit words of f.
__int64 runloop16  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += __popcnt16(f.f16[idx]);
	}
	return total;
}

// Array mode: popcount of the first `count` bytes of f via one table lookup each.
__int64 runlooplut8  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += hamming[f.f8[idx]];
	}
	return total;
}

// Array mode: popcount of the first `count` 16-bit words of f via one
// lookup per word in the 64K-entry table.
_int64 runlooplut16  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += hamming[f.f16[idx]];
	}
	return total;
}
// Array mode: popcount of the first `count` 16-bit words of f, each done as
// two byte-sized table lookups (low byte + high byte).
__int64 runlooplut2x8  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		int lowBits  = hamming[(UCHAR) f.f16[idx]];
		int highBits = hamming[f.f16[idx] >> 8];
		total += lowBits + highBits;
	}
	return total;
}

// Array mode: popcount of the first `count` 32-bit words of f, each done as
// four byte-sized table lookups (one per byte of the word).
__int64 runlooplut4x8  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		int byte0 = hamming[(UCHAR) f.f32[idx]];
		int byte1 = hamming[(UCHAR)(f.f32[idx] >> 8)];
		int byte2 = hamming[(UCHAR)(f.f32[idx] >> 16)];
		int byte3 = hamming[(UCHAR)(f.f32[idx] >> 24)];
		total += byte0 + byte1 + byte2 + byte3;
	}
	return total;
}

#if defined WIN64 || defined _WIN64
// Array mode: sums __popcnt64() over the first `count` 64-bit words of f.
__int64 runloop64  (int count) {
	__int64 total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += __popcnt64(f.f64[idx]);
	}
	return total;
}


// Array mode: sums _mm_popcnt_u64() over the first `count` 64-bit words of f.
__int64 runloopm64  (int count) {
	__int64 total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += _mm_popcnt_u64(f.f64[idx]);
	}
	return total;
}
#endif

// Array mode: sums the software popcount popcntx() over the first `count`
// 32-bit words of f.
__int64 runloopx32  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += popcntx(f.f32[idx]);
	}
	return total;
}

// Array mode: sums the mask-based software popcount popcnty() over the first
// `count` 32-bit words of f.
__int64 runloopy32  (int count) {
	int total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += popcnty(f.f32[idx]);
	}
	return total;
}

// Array mode: sums the software popcount popcntx64() over the first `count`
// 64-bit words of f.
__int64 runloopx64  (int count) {
	__int64 total = 0;
	for (int idx = 0; idx < count; idx++) {
		total += popcntx64(f.f64[idx]);
	}
	return total;
}

#ifdef __INTEL_COMPILER

// No unrolling for Intel Compiler

// Array mode, Intel compiler only: same work as runloop32(), with the loop
// marked `nounroll`.
__int64 runloop32_0  (int count) {
	int total = 0;
	#pragma nounroll
	for (int idx = 0; idx < count; idx++) {
		total += __popcnt(f.f32[idx]);
	}
	return total;
}

#if defined WIN64 || defined _WIN64
// Array mode, Intel compiler only: same work as runloop64(), with the loop
// marked `nounroll`.
__int64 runloop64_0  (int count) {
	__int64 total = 0;
	#pragma nounroll
	for (int idx = 0; idx < count; idx++) {
		total += __popcnt64(f.f64[idx]);
	}
	return total;
}
#endif

// Array mode, Intel compiler only: same work as runloopx64(), with the loop
// marked `nounroll`.
__int64 runloopx64_0  (int count) {
	__int64 total = 0;
	#pragma nounroll
	for (int idx = 0; idx < count; idx++) {
		total += popcntx64(f.f64[idx]);
	}
	return total;
}
// Array mode, Intel compiler only: same work as runloopx32(), with the loop
// marked `nounroll`.
__int64 runloopx32_0  (int count) {
	int total = 0;
	#pragma nounroll
	for (int idx = 0; idx < count; idx++) {
		total += popcntx(f.f32[idx]);
	}
	return total;
}

// Array mode, Intel compiler only: same work as runloop32(), with the loop
// marked both `nounroll` and `novector`.
_int64 runloop32_0n  (int count) {
	int total = 0;
	#pragma nounroll
	#pragma novector
	for (int idx = 0; idx < count; idx++) {
		total += __popcnt(f.f32[idx]);
	}
	return total;
}
#if defined WIN64 || defined _WIN64
// Array mode, Intel compiler only: sums __popcnt64() over the first `count`
// 64-bit words of f, with the loop marked both `nounroll` and `novector`.
// The pragma was previously spelled `novector--`, which does not match the
// spelling used by the sibling *_0n loops; it is now consistent with them.
__int64 runloop64_0n  (int count) {
	__int64 x=0;
	#pragma nounroll
	#pragma novector
	for (int i=0; i<count; i++) x+=__popcnt64 (f.f64[i]);
	return x;
}
#endif

// Array mode, Intel compiler only: same work as runloopx64(), with the loop
// marked both `nounroll` and `novector`.
__int64 runloopx64_0n  (int count) {
	__int64 total = 0;
	#pragma nounroll
	#pragma novector
	for (int idx = 0; idx < count; idx++) {
		total += popcntx64(f.f64[idx]);
	}
	return total;
}


// Array mode, Intel compiler only: sums the 32-bit software popcount
// popcntx() over the first `count` 32-bit words of f, with the loop marked
// both `nounroll` and `novector`.
// Reads f.f32 like runloopx32()/runloopx32_0(); it previously indexed f.f64,
// which silently truncated each 64-bit element to its low 32 bits and so
// measured a different access pattern than its siblings.
__int64 runloopx32_0n  (int count) {
	int x=0;
	#pragma nounroll
	#pragma novector
	for (int i=0; i<count; i++) x+=popcntx (f.f32[i]);
	return x;
}

#endif


#else // no array 
// Divisor bench() uses to turn a total time into a per-operation time.
// In no-array mode the counting loops run i from -count to +count inclusive,
// i.e. 2*N+1 iterations when called with count = N.
// NOTE(review): the expansion is not parenthesized; bench() uses it as (ops).
#define ops 2*N+1
// No-array mode: sums __popcnt() of every integer in [-count, count].
__int64 runloop32  (int count) {
	int total = 0;
	for (int value = -count; value <= count; value++) {
		total += __popcnt(value);
	}
	return total;
}

// No-array mode: sums _mm_popcnt_u32() of every integer in [-count, count].
__int64 runloopm32  (int count) {
	int total = 0;
	for (int value = -count; value <= count; value++) {
		total += _mm_popcnt_u32(value);
	}
	return total;
}

// No-array mode: sums __popcnt16() of every integer in [-count, count]
// (the argument is passed as the int loop counter).
__int64 runloop16  (int count) {
	int total = 0;
	for (int value = -count; value <= count; value++) {
		total += __popcnt16(value);
	}
	return total;
}

// No-array mode: walks the first 256 entries of the lookup table
// 2*(count >> 8)+1 times and sums them.
__int64 runlooplut8  (int count) {
	int total = 0;
	int passes = count >> 8;
	for (int pass = -passes; pass <= passes; pass++) {
		for (int value = 0; value < 256; value++) {
			total += hamming[value];
		}
	}
	return total;
}

// No-array mode: walks all 65536 entries of the lookup table
// 2*(count >> 16)+1 times and sums them.
_int64 runlooplut16  (int count) {
	int total = 0;
	int passes = count >> 16;
	for (int pass = -passes; pass <= passes; pass++) {
		for (int value = 0; value < 65536; value++) {
			total += hamming[value];
		}
	}
	return total;
}
// No-array mode: for every 16-bit value, adds the table entries of its low
// and high byte; the full sweep is repeated 2*(count >> 16)+1 times.
__int64 runlooplut2x8  (int count) {
	int total = 0;
	int passes = count >> 16;
	for (int pass = -passes; pass <= passes; pass++) {
		for (int value = 0; value < 65536; value++) {
			int lowBits  = hamming[(UCHAR) value];
			int highBits = hamming[value >> 8];
			total += lowBits + highBits;
		}
	}
	return total;
}



#if defined WIN64 || defined _WIN64
// No-array mode: sums __popcnt64() of every integer in [-count, count],
// using a 64-bit loop counter.
__int64 runloop64  (int count) {
	__int64 total = 0;
	for (__int64 value = -count; value <= count; value++) {
		total += __popcnt64(value);
	}
	return total;
}


// No-array mode: sums _mm_popcnt_u64() of every integer in [-count, count],
// using a 64-bit loop counter.
__int64 runloopm64  (int count) {
	__int64 total = 0;
	for (_int64 value = -count; value <= count; value++) {
		total += _mm_popcnt_u64(value);
	}
	return total;
}
#endif

// No-array mode: sums the software popcount popcntx() of every integer in
// [-count, count].
__int64 runloopx32  (int count) {
	int total = 0;
	for (int value = -count; value <= count; value++) {
		total += popcntx(value);
	}
	return total;
}


// No-array mode: sums the software popcount popcntx64() of every integer in
// [-count, count], using a 64-bit loop counter.
__int64 runloopx64  (int count) {
	__int64 total = 0;
	for (__int64 value = -count; value <= count; value++) {
		total += popcntx64(value);
	}
	return total;
}

#ifdef __INTEL_COMPILER

// No unrolling for Intel Compiler

// No-array mode, Intel compiler only: same work as runloop32(), with the
// loop marked `nounroll`.
__int64 runloop32_0  (int count) {
	int total = 0;
	#pragma nounroll
	for (int value = -count; value <= count; value++) {
		total += __popcnt(value);
	}
	return total;
}

#if defined WIN64 || defined _WIN64
// No-array mode, Intel compiler only: sums __popcnt64() of every integer in
// [-count, count] with the loop marked `nounroll`. The loop counter is a
// 32-bit int here (unlike runloop64()).
__int64 runloop64_0  (int count) {
	__int64 total = 0;
	#pragma nounroll
	for (int value = -count; value <= count; value++) {
		total += __popcnt64(value);
	}
	return total;
}
#endif

// No-array mode, Intel compiler only: same work as runloopx64(), with the
// loop marked `nounroll`.
__int64 runloopx64_0  (int count) {
	__int64 total = 0;
	#pragma nounroll
	for (_int64 value = -count; value <= count; value++) {
		total += popcntx64(value);
	}
	return total;
}
// No-array mode, Intel compiler only: same work as runloopx32(), with the
// loop marked `nounroll`.
__int64 runloopx32_0  (int count) {
	int total = 0;
	#pragma nounroll
	for (int value = -count; value <= count; value++) {
		total += popcntx(value);
	}
	return total;
}

// No-array mode, Intel compiler only: same work as runloop32(), with the
// loop marked both `nounroll` and `novector`.
_int64 runloop32_0n  (int count) {
	int total = 0;
	#pragma nounroll
	#pragma novector
	for (int value = -count; value <= count; value++) {
		total += __popcnt(value);
	}
	return total;
}
#if defined WIN64 || defined _WIN64
// No-array mode, Intel compiler only: sums __popcnt64() of every integer in
// [-count, count], with the loop marked both `nounroll` and `novector`.
// The pragma was previously spelled `novector--`, which does not match the
// spelling used by the sibling *_0n loops; it is now consistent with them.
__int64 runloop64_0n  (int count) {
	__int64 x=0;
	#pragma nounroll
	#pragma novector
	for (int i=-count; i<=count; i++) x+=__popcnt64 (i);
	return x;
}
#endif

// No-array mode, Intel compiler only: same work as runloopx64(), with the
// loop marked both `nounroll` and `novector`.
__int64 runloopx64_0n  (int count) {
	__int64 total = 0;
	#pragma nounroll
	#pragma novector
	for (__int64 value = -count; value <= count; value++) {
		total += popcntx64(value);
	}
	return total;
}
// No-array mode, Intel compiler only: same work as runloopx32(), with the
// loop marked both `nounroll` and `novector`.
__int64 runloopx32_0n  (int count) {
	int total = 0;
	#pragma nounroll
	#pragma novector
	for (int value = -count; value <= count; value++) {
		total += popcntx(value);
	}
	return total;
}

#endif
#endif



void bench (__int64 (*aloop)(int),char name[80]) { 
   volatile _int64 res=0;
   volatile int vn=N;
	LARGE_INTEGER qa,qe,qf;
	__int64 ta,te,td,tdmin;
	__int64 d,dmin;
	unsigned int id; 
	int reg[4];
  	tdmin=MAXLONGLONG;
	dmin =MAXLONGLONG;

	for(int i=0; i< 100; i++) {
    SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_TIME_CRITICAL);
	QueryPerformanceCounter (&qa);
	 __cpuid(reg,0);
	ta=__rdtsc ();
	res=(*aloop)(vn);
	 te=__rdtscp(&id);
	 __cpuid(reg,0);
	
	QueryPerformanceCounter (&qe);
	QueryPerformanceFrequency(&qf);
	d=qe.QuadPart-qa.QuadPart;    	
	td=te-ta;
	if (td< tdmin) tdmin=td; 
	if (d< dmin) dmin=d;
	//printf ("%4.2f Clocks, %f micros qf=%I64d\n",(double)td/(2*N+1), (double) d/(2*N+1)/qf.QuadPart*1e9,res);
	
	}
	printf_s ("%s %4.2f Clocks %f ns\n",name,(double)tdmin/(ops),(double)dmin/(ops)/qf.QuadPart*1e9 );

 }


// Fills the global table so that hamming[v] is the number of set bits of v
// for every v in 0..65535. Each entry is derived from an earlier one:
// the bit count of v equals the bit count of v >> 1 plus v's lowest bit.
void sethamming() {
	hamming[0] = 0;
	for (int value = 1; value < 65536; ++value) {
		hamming[value] = (unsigned char)(hamming[value >> 1] + (value & 1));
	}
}

// Program entry point.
// Builds the popcount lookup table, fills the test buffer with pseudo-random
// bytes (fixed seed), pins the thread to the first CPU, prints a banner and
// the compiler information, spins in a warm-up loop, and then runs every
// benchmark variant through bench(). Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
	volatile __int64 res = 0;   // volatile so the warm-up loop below is really executed

	sethamming();
	srand(1);
	for (int i = 0; i < 8*N; i++) f.f8[i] = UCHAR(rand());

	SetThreadAffinityMask(GetCurrentThread(), 0x1);
	_tprintf_s (_T("Benchprogramm %s\n"), argv[0]);
	// The trailing newline was missing, which glued this line to the next one.
	printf_s ("__popcnt() und __popcnt64() aus stdlib.h, c't 4/13, as\n");
	printf_s ("popcntx() und popcntx64() sind Inline-Funktionen zum Vergleich\n");
	printf_s ("\n");
	printf_s ("Compilerinfo:\n");
	print_compilerinfo();
	printf_s ("\n");

	SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
	printf_s("Kern fuer Turbo Mode ein paar s hochfahren\n");
	// Busy loop executed before the measurements start (see message above).
	for (_int64 i = 0; i < 3300000000; i++) res++;

	printf_s("ok, Messung startet\n");
	bench(&runloop32,    "__popcnt()     default        :");
	bench(&runloopm32,   "mm_popcnt      default        :");

	bench(&runlooplut8,  "lut8()         default        :");

	bench(&runloop16,    "__popcnt16()   default        :");
	bench(&runlooplut16, "lut16()        default        :");
	bench(&runlooplut2x8,"2xlut8()       default        :");
#ifdef usearray
	// runlooplut4x8 is only defined in the array variant of the loops.
	bench(&runlooplut4x8,"4xlut8()       default        :");
#endif

#if defined WIN64 || defined _WIN64
	bench(&runloop64,    "__popcnt64     default        :");
	bench(&runloopm64,   "mm_popcnt64    default        :");
#endif

	bench(&runloopx32,   "popcntx        default        :");
#ifdef usearray
	// runloopy32 is only defined in the array variant of the loops.
	bench(&runloopy32,   "popcnty        default        :");
#endif
	bench(&runloopx64,   "popcntx64      default        :");

#if defined __INTEL_COMPILER
	bench(&runloop32_0,  "__popcnt    Unroll(0)         :");
	bench(&runloop32_0n, "__popcnt    Unroll(0) novector:");
	#if defined WIN64 || defined _WIN64
	bench(&runloop64_0,  "__popcnt64  Unroll(0)         :");
	bench(&runloop64_0n, "__popcnt64  Unroll(0) novector:");
	#endif

	bench(&runloopx32_0, "popcntx     Unroll(0)         :");
	bench(&runloopx32_0n,"popcntx     Unroll(0) novector:");
	bench(&runloopx64_0, "popcntx64   Unroll(0)         :");
	bench(&runloopx64_0n,"popcntx64   Unroll(0) novector:");
#endif

	SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_NORMAL);
	return 0;
}

