@echo off
rem ---------------------------------------------------------------------
rem Entry point: fill in defaults for any unset mm_* variable, then
rem dispatch on the first argument:
rem   mm               type mm.h and run the existing executable
rem   20 or MKL        MKL build
rem   number below 20  generate a matrixmul kernel into mm.h and build it
rem   existing file    build with that file appended to mm.h
rem Every default below is applied only when the variable is not already
rem set, so callers can override them from the environment.
rem ---------------------------------------------------------------------

if "%mm_exepath%"  =="" set mm_exepath=..\x64
if "%mm_exename%"  =="" set mm_exename=mmgen
if "%mm_runpars%"  =="" set mm_runpars=-t 1000 
if "%mm_DIM0%"     =="" set mm_DIM0=1024
if "%mm_DIM%"      =="" set mm_DIM=%mm_DIM0%
if "%mm_dimname%"  =="" set mm_dimname=DIM
if "%mm_dim0name%" =="" set mm_dim0name=DIM0


set mm_mtype=%1
rem Without this guard the unquoted comparisons below expand to invalid
rem syntax when no argument is given and cmd aborts with a parser error.
if "%mm_mtype%"=="" echo missing command: expected mm, MKL, 20, a kernel type number below 20, or a file name & goto :eof
if "%mm_mtype%"=="mm" goto MM_SHOW
if "%mm_mtype%"=="20" goto MM_MKL
if /I "%mm_mtype%"=="MKL" goto MM_MKL

rem LSS needs the bare value to compare numerically, so it stays unquoted.
if %mm_mtype% LSS 20 goto MM_IJK
if exist "%mm_mtype%" goto MM_FILE
echo unknown command %mm_mtype%
goto :eof

:MM_IJK
rem ---------------------------------------------------------------------
rem Generated-kernel mode, reached for a numeric first argument below 20.
rem   argument 2      loop order: ijk ikj jik jki kji kij, or all
rem   argument 3      vectorisation keyword, stored in mm_vec
rem   argument 4      compiler name, stored in mm_comp
rem   arguments 5-9   extra compiler flags, stored in mm_flags
rem mm_i, mm_j and mm_k receive the loop variable used for the outer,
rem middle and inner loop of the generated kernel.
rem Optional environment: mm_tiling, mm_setnull, mm_openmp, mm_noconstruct.
rem ---------------------------------------------------------------------

rem Reject a missing or misspelled loop order. Otherwise mm_i, mm_j and
rem mm_k stay empty or keep values from an earlier run, and the generated
rem kernel is either uncompilable or not the one that was asked for.
rem The check is skipped when mm_noconstruct is set because mm.h is then
rem not regenerated and the loop order is not used to build it.
set mm_order_ok=
for %%o in (ijk ikj jik jki kji kij all) do if "%2"=="%%o" set mm_order_ok=1
if not "%mm_noconstruct%"=="" set mm_order_ok=1
if not defined mm_order_ok echo unknown loop order "%2": expected ijk ikj jik jki kji kij or all & goto :eof
set mm_order_ok=

if "%2"=="ijk" set mm_i=i& set mm_j=j& set mm_k=k
if "%2"=="ikj" set mm_i=i& set mm_j=k& set mm_k=j
if "%2"=="jik" set mm_i=j& set mm_j=i& set mm_k=k
if "%2"=="jki" set mm_i=j& set mm_j=k& set mm_k=i
if "%2"=="kji" set mm_i=k& set mm_j=j& set mm_k=i
if "%2"=="kij" set mm_i=k& set mm_j=i& set mm_k=j
set mm_vec=%3
set mm_comp=%4
if "%mm_tiling%"=="" set mm_tiling=1
if "%mm_setnull%"=="" set mm_setnull=setnull
rem The keyword openmp is replaced by the pragma text written into mm.h.
if /I "%mm_openmp%"=="openmp" set mm_openmp=omp parallel for

set mm_flags=%5 %6 %7 %8 %9
if /I not "%mm_openmp%"=="" set mm_flags=%mm_flags% /Qopenmp


rem all: generate, build and run each of the six loop orders in turn.
if "%2"=="all" (
set mm_i=i& set mm_j=j& set mm_k=k& call :mm_construct & call :compile
set mm_i=i& set mm_j=k& set mm_k=j& call :mm_construct & call :compile
set mm_i=j& set mm_j=i& set mm_k=k& call :mm_construct & call :compile
set mm_i=j& set mm_j=k& set mm_k=i& call :mm_construct & call :compile
set mm_i=k& set mm_j=j& set mm_k=i& call :mm_construct & call :compile
set mm_i=k& set mm_j=i& set mm_k=j& call :mm_construct & call :compile
)

if not "%2"=="all" call :mm_construct & call :compile

goto :exit

:MM_MKL
rem ---------------------------------------------------------------------
rem MKL mode, reached for a first argument of 20 or MKL.
rem Argument 2 must be parallel or sequential and selects the matching
rem /Qmkl option. The build always uses icl.
rem ---------------------------------------------------------------------
set mm_setnull=
set mm_comp=icl
set mm_flags=
if /I "%2"=="parallel" set mm_flags=/Qmkl:parallel
if /I "%2"=="sequential" set mm_flags=/Qmkl:sequential
if "%mm_flags%"=="" echo error mkl parallel or mkl sequential & goto :eof
set mm_flags=%mm_flags% /DMKL
rem mm.h becomes the description string followed by the contents of
rem mmMKL.h. The string is written before the include path and /openmp
rem are appended, so it lists only the /Qmkl and /DMKL options.
echo char* seq="%mm_comp% %mm_flags%  %mm_dimname%=%mm_dim% %mm_dim0name%=%mm_dim0%"; > mm.h
type mmMKL.h >> mm.h
rem set mm_flags=%mm_flags% /openmp /I"%mklroot%"\include mkl_intel_lp64.lib mkl_core.lib mkl_intel_thread.lib
set mm_flags=%mm_flags% /openmp /I"%mklroot%"\include
call :compile
goto :eof

:MM_FILE
rem ---------------------------------------------------------------------
rem File mode, reached when the first argument names an existing file.
rem   argument 1      file whose contents are appended to mm.h
rem   argument 2      compiler name, stored in mm_comp
rem   arguments 3-6   extra compiler flags, stored in mm_flags
rem ---------------------------------------------------------------------
set mm_setnull=
set mm_comp=%2
set mm_flags=%3 %4 %5 %6

rem mm.h becomes the description string followed by the given file.
echo char* seq="%mm_comp% %1 %mm_flags%  %mm_dimname%=%mm_dim% %mm_dim0name%=%mm_dim0%"; > mm.h
type "%1" >> mm.h

call :compile
goto :eof



:MM_SHOW
rem Show mode, reached for a first argument of mm: print the current mm.h
rem and run the existing executable without rebuilding it.
type mm.h
call :run
goto :exit

:mm_construct
rem ---------------------------------------------------------------------
rem Write mm.h for the kernel type in mm_mtype: a MTYPE define, the
rem description string seq, the RUNPROC call macro and a matrixmul
rem function with three nested loops.
rem Reads: mm_mtype, mm_i, mm_j, mm_k as outer, middle and inner loop
rem variable, mm_vec, mm_comp, mm_flags, mm_tiling, mm_setnull, mm_openmp,
rem mm_dimname, mm_dim0name, mm_dim, mm_dim0.
rem Does nothing when mm_noconstruct is set, so an existing mm.h is kept.
rem ---------------------------------------------------------------------
if not "%mm_noconstruct%"=="" goto :eof
rem inloop zeroes c[i][j] just before the inner loop, which is only
rem correct when k is the inner loop variable. Other orders are skipped.
if /I "%mm_setnull%"=="inloop" if not "%mm_k%"=="k" goto :eof
echo #define MTYPE %mm_mtype% > mm.h
echo char* seq="%mm_comp% mm_mul_%mm_mtype%  %mm_flags% %mm_i%-%mm_j%-%mm_k% %mm_vec% %mm_dimname%=%mm_dim% %mm_dim0name%=%mm_dim0% %mm_setnull% tiling=%mm_tiling%"; >> mm.h

rem Call expression matching the signature chosen further down.
if %mm_mtype%==0 echo #define RUNPROC matrixmul()  >> mm.h
if %mm_mtype%==1 echo #define RUNPROC matrixmul(dim)  >> mm.h
if %mm_mtype%==2 echo #define RUNPROC matrixmul(a, b, c)  >> mm.h
if %mm_mtype%==3 echo #define RUNPROC matrixmul(a, b, c, dim)  >> mm.h
if %mm_mtype%==11 echo #define RUNPROC matrixmul(ap, bp, cp)  >> mm.h
if %mm_mtype%==12 echo #define RUNPROC matrixmul(ap, bp, cp, dim)  >> mm.h
if %mm_mtype%==13 echo #define RUNPROC matrixmul(ap, bp, cp, dim, dim0)  >> mm.h
if %mm_mtype%==14 echo #define RUNPROC matrixmul(ap, bp, cp, dim, dim0)  >> mm.h

rem Function header: types below 10 use mat arguments or none at all,
rem types 11 to 14 use double pointers.
if %mm_mtype%==0 echo void matrixmul() { >> mm.h
if %mm_mtype%==1 echo void matrixmul(int dim) { >> mm.h
if %mm_mtype%==2 echo void matrixmul(const mat a, const mat b, mat c) { >> mm.h
if %mm_mtype%==3 echo void matrixmul(const mat a, const mat b, mat c, int dim) { >> mm.h
if %mm_mtype%==11 echo void matrixmul(const double* a, const double*  b, double* c) { >> mm.h
if %mm_mtype%==12 echo void matrixmul(const double* a, const double*  b, double* c, int dim) { >> mm.h
if %mm_mtype%==13 echo void matrixmul(const double* a, const double*  b, double* c, int dim, int dim0) { >> mm.h
if %mm_mtype%==14 echo void matrixmul(const double* __restrict a, const double* __restrict b, double* __restrict c, int dim, int dim0) { >> mm.h

rem The kernel driver defaults mm_setnull to setnull, so that spelling
rem has to emit the call. The older value mm_setnull is still accepted.
if /I "%mm_setnull%"=="setnull" echo  setNull(); >> mm.h
if /I "%mm_setnull%"=="mm_setnull" echo  setNull(); >> mm.h
if /I not "%mm_openmp%"=="" echo #pragma %mm_openmp% >>mm.h

:external_loop
rem With tiling the middle loop is split into tiles. The tile count is
rem rounded up so a final partial tile is not dropped when the dimension
rem is not a multiple of the tile size. The min in the middle loop
rem clamps that last tile to the dimension.
if %mm_tiling% NEQ 1 echo  for (int %mm_j%x = 0; %mm_j%x ^< (%mm_dimname%+%mm_tiling%-1)/%mm_tiling%; %mm_j%x++)  >> mm.h
echo  for (int %mm_i% = 0; %mm_i% ^< %mm_dimname%; %mm_i%++) >>mm.h

:middle_loop
rem int k = n*kx; k< min(n*kx + n, dim); k++)
if %mm_tiling% NEQ 1 echo   for (int %mm_j% = %mm_tiling%*%mm_j%x; %mm_j% ^< min(%mm_tiling%*(%mm_j%x+1),%mm_dimname%); %mm_j%++) { >> mm.h
if %mm_tiling% EQU 1 echo   for (int %mm_j% = 0; %mm_j% ^< %mm_dimname%; %mm_j%++) { >> mm.h
rem Zero the target element with the same indexing the multiply uses:
rem two-dimensional for types below 10, flat pointer indexing otherwise.
if /I "%mm_setnull%"=="inloop" if %mm_mtype% LSS 10 echo    c[i][j]=0.0; >> mm.h
if /I "%mm_setnull%"=="inloop" if %mm_mtype% GEQ 10 echo    c[i*%mm_dim0name%+j]=0.0; >> mm.h
if /I "%mm_vec%"=="novec"  echo     #pragma loop(no_vector) >> mm.h
rem vec is the counterpart of novec. The older value mm_vec still works.
if /I "%mm_vec%"=="vec" echo     #pragma simd >> mm.h
if /I "%mm_vec%"=="mm_vec" echo     #pragma simd >> mm.h

:inner_loop
echo    for (int %mm_k% = 0; %mm_k% ^< %mm_dimname%; %mm_k%++) {>> mm.h
if %mm_mtype% LSS 10 echo    c[i][j] += a[i][k] * b[k][j]; >> mm.h
if %mm_mtype% GEQ 10 echo    c[i*%mm_dim0name%+j] +=a[i*%mm_dim0name%+k]*b[k*%mm_dim0name%+j]; >> mm.h

echo   } >> mm.h
echo  }  >> mm.h
echo }   >> mm.h
goto :eof

:compile
rem ---------------------------------------------------------------------
rem Build the executable from the current mm.h with the compiler named in
rem mm_comp, icl or cl, echo the compiler report lines that refer to mm.h,
rem then fall through to :run.
rem Reads: mm_comp, mm_flags, mm_DIM0, mm_DIM, mm_exepath, mm_exename.
rem Compiler output is captured in out.log and err.log.
rem ---------------------------------------------------------------------
rem Same skip rule as kernel generation: inloop needs k as the inner loop.
if /I "%mm_setnull%"=="inloop" if not "%mm_k%"=="k" goto :eof
rem Stop before deleting anything when the compiler is missing or unknown.
rem Previously the old executable was removed, nothing was built, and the
rem run step then failed on the missing file.
if /I not "%mm_comp%"=="icl" if /I not "%mm_comp%"=="cl" echo unknown compiler "%mm_comp%": expected icl or cl & goto :eof
del *.obj 2> nul
if exist %mm_exepath%\%mm_exename%.exe del %mm_exepath%\%mm_exename%.exe
del *.log 2> nul
rem The for loops split each report line on colon and parentheses and
rem print the third field and the remainder for lines whose second field
rem has the file name mm.h. icl reports are read from err.log, cl reports
rem from out.log.
if /I "%mm_comp%"=="icl" (
icl %mm_flags% /Ob0 /DDIM0=%mm_DIM0% /DDIM=%mm_DIM% /Dstaticarrays %mm_exename%.cpp -I ..\mmtest /Qvec-report1 -Fe%mm_exepath%\%mm_exename%.exe >out.log   2>err.log
For /f "tokens=1,2,3,4* delims=:()" %%i in (err.log) do  (
  if "%%~nxj"=="mm.h" echo mm.h_%%k, %%m
 )
)
if /I "%mm_comp%"=="cl" (
cl %mm_flags% /Ob0 /DDIM0=%mm_DIM0% /DDIM=%mm_DIM% /EHsc /Dstaticarrays %mm_exename%.cpp -I ..\mmtest /Qvec-report:2 /Qpar-report:2 -Fe%mm_exepath%\%mm_exename%.exe >out.log 2>err.log
For /f "tokens=1,2,3,4* delims=:()" %%i in (out.log) do  (
  if "%%~nxj"=="mm.h" echo mm.h_%%k, %%m
 )
)
:run
rem Run the executable with the parameters in mm_runpars. The executable
rem is absent after a failed build or before the first build, so report
rem that clearly instead of letting cmd fail on the missing file.
if not exist %mm_exepath%\%mm_exename%.exe echo %mm_exepath%\%mm_exename%.exe not found, nothing to run: after a failed build see out.log and err.log & goto :eof
%mm_exepath%\%mm_exename%.exe %mm_runpars%
goto :eof


:exit