// absbench.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include <Windows.h>
#include <intrin.h> 
#include <emmintrin.h>

double BenchPageDoubleSSE (PVOID userptr, int pages, int maxr) {
#define CLSIZE 64 //Cacheline 64
	__m128d sum, * a; 
	int reg[4];
	volatile int dummy1;
	unsigned int id;
	unsigned long long t0,t1, tstart, tend; 
	unsigned long long diff,mindiff=MAXLONGLONG; 
	LARGE_INTEGER qe,qa,qf; 

	SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_TIME_CRITICAL);
	QueryPerformanceCounter(&qa); 
		__cpuid(reg,0);
		tstart=__rdtsc();
	for (int r=0; r< maxr;r++) {
		
		sum=_mm_setzero_pd();
		//flush Cachelines 
		char* ap=(char*)userptr; 
		_mm_mfence();

		for (int cl=0;cl<4096*pages/CLSIZE; cl++) 
		{_mm_clflush(ap); ap+=CLSIZE;}
		_mm_mfence();
		a=(__m128d*)userptr;

		__cpuid(reg,0);
		t0=__rdtsc();
		//****** Messcode ************
		while (a< (__m128d*) ap)  
			sum=_mm_add_pd (sum,*a++);	
		//****************************
		_mm_mfence(); 
		t1=__rdtscp(&id);
		__cpuid(reg,0);
		//Compiler austricksen 
		dummy1=_mm_cvtsd_si32(sum); 
		if (r<3) continue; //wg. Warmlaufen 
		diff=t1-t0;
		if (diff < mindiff) mindiff =diff; 
		
	}
	   tend=__rdtscp(&id);
		__cpuid(reg,0);
		QueryPerformanceCounter(&qe); 
		QueryPerformanceFrequency(&qf); 
	SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_NORMAL);	
	double clock= (double)(tend-tstart)/(double) (qe.QuadPart-qa.QuadPart) *(double) qf.QuadPart;
	return ((double)mindiff)/clock;
}



int _tmain(int argc, _TCHAR* argv[]){   
	volatile __int64 res=0; 
	int CPUInfo[4];
	int cpu=0;
	int pages=1; 
	bool skipwarmup=false;
 int para=1;
 while (para < argc) {
	if (_tcscmp(argv[para],L"-c")==0) cpu=_tstoi(argv[++para]);
	if (_tcscmp(argv[para],L"-p")==0) pages=_tstoi(argv[++para]);
	if (_tcscmp(argv[para],L"-skip")==0) skipwarmup=true;
	para++;
 }


SetThreadAffinityMask(GetCurrentThread(),1LL << cpu);
_tprintf_s (_T("Benchprogramm %s\n"),argv[0]);
_tprintf_s (_T("Beispielprogramm rdtscp, clflush und mfence c't 17/13\n"));
_tprintf_s (_T("Eingabe : -c cpu, default cpu=0, maximal bis cpu=64\n")); 
_tprintf_s (_T("Eingabe : -p pages, default pages=1\n")); 
_tprintf_s (_T("Messung der Speicherbandbreite 1 Page von cpu %d auf alle Knoten\n"),cpu);
_tprintf_s (_T("\n"));
__cpuid(CPUInfo,0x80000001);
bool bRDTSCP = (CPUInfo[3] & 0x8000000) || false;
if (bRDTSCP) _tprintf_s (_T("ok, CPU supports RDTSCP\n"));
else { _tprintf_s (_T("oops,this CPU doesn't support RDTSCP\n")); exit(1); }

__cpuid(CPUInfo,1);
int nCLFLUSHcachelinesize = ((CPUInfo[1] >> 8) & 0xff) * 8;
if (nCLFLUSHcachelinesize>0) _tprintf_s (_T("ok, CPU supports CLFLUSH with size %d\n"),nCLFLUSHcachelinesize);
else { _tprintf_s (_T("oops,this CPU doesn't support CLFLUSH\n")); exit (1); }

SetPriorityClass(GetCurrentProcess(),HIGH_PRIORITY_CLASS); 
if (!skipwarmup) {
printf_s("Kern fuer Turbo Mode ein paar s hochfahren\n");   
for (_int64 i=0; i< 3300000000; i++) res++;
}
printf_s("Messung startet\n"); 

int AllocationSize=4096*pages; 
ULONG HighestNodeNumber=0;
if (!GetNumaHighestNodeNumber (&HighestNodeNumber)){
	_tprintf (_T("GetNumaHighestNodeNumber failed: %d\n"), GetLastError());
}

for (int NodeNumber=0; NodeNumber <=HighestNodeNumber; NodeNumber++) {
	PCHAR Buffer = (PCHAR)VirtualAllocExNuma(
		GetCurrentProcess(),
		NULL,
		AllocationSize,
		MEM_RESERVE | MEM_COMMIT,
		PAGE_READWRITE,
		NodeNumber
		);
	if (Buffer==NULL)  {
		_tprintf (_T("VirtualAllocExNuma failed: %d\n"), GetLastError()); exit(1) ; 
	}

	double tmin=BenchPageDoubleSSE (Buffer, pages,100);
	double bandwidth=AllocationSize/tmin;  
	_tprintf_s(_T("%d Pages, Single-Thread-Zugriff cpu (%d) auf Node (%d):%12.3lf GByte/s\n"),pages,cpu, NodeNumber, bandwidth/1e9);  
}


return 0;
}

