// mmtest.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include <intrin.h> 
#include <Windows.h> 
#include <string>
#include <math.h>
#include <intrin.h>
#include <xmmintrin.h>
#ifdef MKL
#include <mkl.h>
#endif 
#include "compilerinfo.h"

#ifndef DIM0 
#define DIM0 1024
#endif

#ifndef DIM 
#define DIM DIM0
#endif


#ifndef NOMSR
#include "mymsr.h"
#endif


// Set by the "-v" command-line switch in _tmain; enables the diagnostic printf output.
bool verbose = false;
// Base pointers of the three matrices. _tmain points them either at the static
// arrays (staticarrays build) or at _aligned_malloc'ed buffers, and passes them
// to fill() as a, b and c. cp is also the matrix that setNull() clears and
// that _tmain hands to sumup() for the result check.
double* ap;
double* bp;
double* cp;
// Number of rows/columns actually used; defaults to DIM, overridable with "-d".
int dim = DIM;
// Row stride (leading dimension) used for indexing; defaults to DIM0, overridable with "-d0".
int dim0 = DIM0;
// Filled by GetSystemInfo() in _tmain.
SYSTEM_INFO SystemInfo;
// Filled by GetProcessAffinityMask() in _tmain; run() applies ProcessAffinityMask
// to the current thread for the duration of the measured loop.
DWORD_PTR ProcessAffinityMask, SystemAffinityMask;

// Uncomment the following define (or define staticarrays on the compiler command
// line) to use statically allocated matrices instead of heap buffers.
//#define staticarrays

#ifdef staticarrays 
// DIM0 x DIM0 matrix of doubles.
typedef double mat[DIM0][DIM0];
// The three matrices, each aligned to 128 bytes. _tmain points ap/bp/cp at
// their first elements when staticarrays is defined.
__declspec(align(128)) mat a;
__declspec(align(128)) mat b;
__declspec(align(128)) mat c;

#endif

/*
 * Initialise three square matrices stored row-major with a row stride of dim0.
 *
 *   a[i][j] = b[i][j] = i * j   and   c[i][j] = 0     for 0 <= i, j < dim
 *
 * Elements in the padding columns (j >= dim) are left untouched.
 *
 * a, b, c : base pointers of the matrices; each must hold at least
 *           (dim - 1) * dim0 + dim doubles.
 * dim     : number of rows/columns that are written.
 * dim0    : distance, in elements, between the starts of consecutive rows.
 */
void fill(double *a, double *b, double *c, int dim, int dim0) {
	for (int i = 0; i < dim; i++) {
		// Offsets are computed in size_t: i*dim0 overflows int for large matrices.
		const size_t rowStart = (size_t)i * (size_t)dim0;
		for (int j = 0; j < dim; j++) {
			// Product is formed in double so that it cannot overflow int either.
			const double value = (double)i * (double)j;
			a[rowStart + (size_t)j] = value;
			b[rowStart + (size_t)j] = value;
			c[rowStart + (size_t)j] = 0;
		}
	}
}

/*
 * Sum all elements of the dim x dim matrix c, stored row-major with row
 * stride dim0. Padding columns (j >= dim) are not included.
 *
 * Elements are accumulated row by row, left to right, into a double.
 * Returns 0.0 when dim <= 0.
 */
double sumup(double * c, int dim, int dim0) {
	double r = 0.0;
	for (int i = 0; i < dim; i++) {
		// Offset is computed in size_t: i*dim0 overflows int for large matrices.
		const size_t rowStart = (size_t)i * (size_t)dim0;
		for (int j = 0; j < dim; j++) {
			r += c[rowStart + (size_t)j];
		}
	}
	return r;
}

// Clear the result matrix: writes 0.0 to every element of the dim x dim
// matrix behind the global pointer cp, using the global row stride dim0.
void setNull() {
	for (int row = 0; row < dim; ++row) {
		double *out = cp + row*dim0;
		for (int col = 0; col < dim; ++col) {
			out[col] = 0.0;
		}
	}
}

#include "mm.h"

// Execute RUNPROC `rep` times and measure the elapsed time.
//
//   rep    : number of times RUNPROC is executed inside the timed region.
//   clocks : out - difference of the time-stamp counter read after and before the loop.
//   time   : out - elapsed seconds according to QueryPerformanceCounter.
//   afreq  : out - ratio (THREAD delta) / (REF delta) of the two MSR values read
//            before and after the loop; fixed to 1.4 when built with NOMSR.
//
// Returns true; `result` is never changed after its initialisation.
//
// RUNPROC is a macro that is not defined in this part of the file
// (presumably it comes from mm.h - confirm).
bool run(int rep, long long &clocks, double &time, double &afreq) {
	int reg[4] = { -1 };       // output buffer for __cpuid; the values are not used
	unsigned int id;           // receives the auxiliary value of __rdtscp; not used
	unsigned long long t0, t1; // time-stamp counter before / after the loop
	LARGE_INTEGER Qa, Qe, Qf;  // performance counter start / end / frequency
	bool result = true;
	// Restrict the thread to affinity mask 1 while the start values are read.
	SetThreadAffinityMask(GetCurrentThread(), 1);
#ifndef NOMSR
	uint64  CLK_UNHALTED_REFa = 0;
	uint64  CLK_UNHALTED_THREADa = 0;
	uint64  CLK_UNHALTED_REFe = 0;
	uint64  CLK_UNHALTED_THREADe = 0;
	// Start values of the two counters ("a" suffix = start, "e" suffix = end).
	readMSR(CPU_CLK_UNHALTED_REF_ADDR, &CLK_UNHALTED_REFa);
	readMSR(CPU_CLK_UNHALTED_THREAD_ADDR, &CLK_UNHALTED_THREADa);

#endif 

	QueryPerformanceCounter(&Qa);
	t0 = __rdtsc();
	// NOTE(review): __cpuid is presumably issued only for its serialising effect - confirm.
	__cpuid(reg, 1);
	// Let the measured code run on every processor the process is allowed to use.
	SetThreadAffinityMask(GetCurrentThread(),ProcessAffinityMask);
	
	for (int r = 0; r < rep; r++) RUNPROC;

	// Back to affinity mask 1 before the end values are read.
	SetThreadAffinityMask(GetCurrentThread(), 1);
	t1 = __rdtscp(&id);
	__cpuid(reg, 0);
	QueryPerformanceCounter(&Qe);
	QueryPerformanceFrequency(&Qf);
#ifndef NOMSR
	readMSR(CPU_CLK_UNHALTED_REF_ADDR, &CLK_UNHALTED_REFe);
	readMSR(CPU_CLK_UNHALTED_THREAD_ADDR, &CLK_UNHALTED_THREADe);
	// NOTE(review): if the REF counter did not advance, this divides by zero and
	// afreq becomes inf or NaN.
	afreq = (double)(CLK_UNHALTED_THREADe - CLK_UNHALTED_THREADa) / (double)(CLK_UNHALTED_REFe - CLK_UNHALTED_REFa);
	// NOTE(review): %I64d is a signed format; uint64 looks like an unsigned type - confirm.
	if (verbose) printf("CLK unhalted %I64d %I64d ref=%I64d %I64d\n", CLK_UNHALTED_THREADe, CLK_UNHALTED_THREADa,
		CLK_UNHALTED_REFe, CLK_UNHALTED_REFa);

#else
	// No MSR access in this build: a constant is reported instead of a measured ratio.
	afreq = 1.4;
#endif
	clocks = (t1 - t0);
	// Counter ticks divided by ticks per second gives seconds.
	time = (double)(Qe.QuadPart - Qa.QuadPart) / (double)Qf.QuadPart;
	return result;
}


int _tmain(int argc, _TCHAR* argv[])
{

	volatile __int64 dummy = 0;
	bool skip = false;
	int minsecs = 10000; // 1000 ms 
	int rep;
	long long clocks;
	double secs;
	double afreq;
	
	
	for (int p = 1; p <argc; p++) {
		if (_tcscmp(argv[p], _TEXT("-t")) == 0)   { minsecs = _tstoi(argv[++p]); continue; }
		if (_tcscmp(argv[p], _TEXT("-skip")) == 0) skip = true;
		if (_tcscmp(argv[p], _TEXT("-v")) == 0)    verbose = true;
		if (_tcscmp(argv[p], _TEXT("-d")) == 0)   { dim = _tstoi(argv[++p]); continue; }
	    if (_tcscmp(argv[p], _TEXT("-d0")) == 0)  { dim0 = _tstoi(argv[++p]); continue; }

	}

	GetSystemInfo(&SystemInfo);
	GetProcessAffinityMask(GetCurrentProcess(), &ProcessAffinityMask, &SystemAffinityMask);
	if (verbose) print_compilerinfo();
	//#define showmatrices
	
	SetThreadAffinityMask(GetCurrentThread(),1);
#ifndef NOMSR 
	if (!InitDrv()) {
		printf(" no access to msr-driver, start with admin rights\n");
		return 1;
		EnableFixCtrl(); // fr Core 0 
	}
#endif;
	if (verbose) printf("Hochfahren\n");
#ifdef _DEBUG 
	skip = true;
#endif
	if (!skip) for (__int64 i = 0; i < 3000000000; i++) dummy++;
#ifdef _SSE3 
	printf("Option=SSE3\n");
#endif
#ifdef staticarrays 
	ap = *a; // = (double*) &a 
	bp = *b; // = (double*) &b
	cp = *c; // = (double*) &c	
#else
	ap = (double*)_aligned_malloc(dim0*dim0*sizeof(double), 128);
	bp = (double*)_aligned_malloc(dim0*dim0*sizeof(double), 128);
	cp = (double*)_aligned_malloc(dim0*dim0*sizeof(double), 128);
#endif

	int size = dim*dim * 8;
	if (verbose) {

		printf("Messung mit DIM=%d, leading DIM=%d, Memory/Array=", dim, dim0);
		if (size < 1024) printf(" %d B ", size);
		else if (size < 1024 * 1024) printf(" %d KB ", size / 1024);
		else if (size < 1024 * 1024 * 1024) printf(" %d MB", size / (1024 * 1024));
		printf("\n");
	}
					
			fill(ap, bp, cp, dim, dim0);
			rep = 1;

			if (!run(rep, clocks, secs, afreq)) return 1;
			double mults = (double)dim*(double)dim*(double)dim;
			double adds = (double)(dim - 1)*(double)dim*(double)dim;
#ifdef NOCHECK 
			bool ok=true; 
#else
			double s = sumup(cp, dim, dim0);
			double d = (dim*(dim - 1)) / 2;
			double sx = (d*d*d*(2 * dim - 1)) / 3;
			double eps = (1 - s / sx);
			bool ok = abs(eps) < (1e-18)*mults;
			if (verbose) printf("res=%f ref=%f diff=%f\n", s, sx, eps);
#endif
			if (ok) {
				if (secs < 0.1) run(rep, clocks, secs, afreq);
				if (1000 * secs < minsecs) {
					rep = (int)(minsecs / (1000 * secs) + 0.5);
					if (rep == 0) rep = 1;
					run(rep, clocks, secs, afreq);
				}
				if (1100 * secs < minsecs) {
					rep *= 2;
					run(rep, clocks, secs, afreq);
				}
			}

#ifdef OPS 
			double ops = OPS;
#else
			double ops = (mults+adds);
#endif

			if (ok) {
				double currfreq = (double)clocks / secs*afreq;
				printf("%s\n", seq);
				printf("%7.3lf s", secs);
				printf("%7d reps", rep);
				if (secs/rep > 0.1) printf("%7.3lf s/rep", secs / rep);
				else printf("%7.3lf ms/rep ", 1000*secs / rep);
				printf("%6.2lf GF/s =", (ops*rep) / secs / 1e9);
				printf(" %6.3lf Flops/c", (ops*rep) / ((double)clocks*afreq));
				printf(" @%1.0lf MHz\n", currfreq / 1e6);
			}
			else printf(" ## not ok");
			printf("\n");
#ifdef showmatrices 
			printf("\n");
			for (int i = 0; i <dim; i++) {
				for (int j = 0; j <dim; j++) printf("%d ", (int)a[i][j]);
				printf("\n");
			}
			printf("\n");
			for (int i = 0; i <dim; i++) {
				for (int j = 0; j <dim; j++) printf("%d ", (int)c[i][j]);
				printf("\n");
			}
#endif

		
	


	return 0;
}

