.packageName <- "RLMM"
Classify <- function(genotypefile="",regionsfile="",thetafile="",callrate=100)
{

md.classify <- function(theta.A,theta.B,m.AA,m.AB,m.BB,s.AA,s.AB,s.BB,callrate,threshold,debug)
{

theta <- cbind(theta.A,theta.B)
md.AA <- mahalanobis(theta,m.AA,s.AA)
md.AB <- mahalanobis(theta,m.AB,s.AB)
md.BB <- mahalanobis(theta,m.BB,s.BB)
geno <- array(-1,length(md.AA))
mind<- array(-1,length(md.AA))
res <- ""
for (i in 1:length(md.AA))
{
### Always make a call
mind[i]<- min (md.AA[i],md.AB[i],md.BB[i])
if (mind[i]==md.AA[i]) geno[i] <- "AA"
if (mind[i]==md.AB[i]) geno[i] <- "AB"
if (mind[i]==md.BB[i]) geno[i] <- "BB"
if ((callrate <100) & (mind[i]>threshold)) {geno[i] <-"NC"}
res <- paste(res,geno[i],round(mind[i],4),sep=" ")
} #for i loop

return(res)
} # end function md.classify

calc.meancov <-function(num.AA,num.AB,num.BB,M.AA,M.AB,M.BB,sigma.AA,sigma.AB,sigma.BB,B,al,BnB,Bal,CB,Cal, T.A,T.B, SAA, SAB, SBB,rate,thresh)
{

if (num.AA <= 5)
{
if (num.AB>1 & num.BB>1) {M.AA <- B%*%   t(t(c(M.AB,M.BB))) + t(t(al))}
}
if (num.AB <= 5)
{
if (num.AA>1 & num.BB>1) {M.AB <-BnB%*% t(t(c(M.AA,M.BB))) + t(t(Bal))}
}
if (num.BB <= 5)
{
if (num.AA>1 & num.AB>1) {M.BB <-CB%*%  t(t(c(M.AA,M.AB))) + t(t(Cal))}
}
if (num.BB <= 5 | num.AB <= 5 | num.AA <= 5 )
{S.AA <- SAA
 S.AB <- SAB
 S.BB <- SBB
}
else 
{
if (num.AA>5)
{
S.AA <- sigma.AA + .008		#Add constant
}
if (num.AB>5)
{
S.AB <- sigma.AB + .008
}
if (num.BB>5)
{
S.BB <- sigma.BB + .008
}
} # end else

res <- md.classify(T.A,T.B,M.AA,M.AB,M.BB,S.AA,S.AB,S.BB,rate,thresh,dflag)
return(res)
} #end function


###### START THE MAIN PROGRAM ###########

if(thetafile=="")
   stop("Please specify a filename for the thetafile parameter")

if(genotypefile=="")
   stop("Please specify a filename for the genotypefile parameter")

if(regionsfile=="")
   stop("Please specify a filename for the regionsfile parameter")


#Read the SNP names in the thetafile
snpfn <- read.table(thetafile,as.is=T)
snpfn <- snpfn[,1]

#Process call rates
rates <- seq(80,100,2)
m <- is.na(match(callrate,rates))
if (m ==TRUE ) {print("Setting Call Rate to 80%"); rateindex<-1; callrate<-80} 
else {status<-paste("Setting Call Rate to",callrate); print(status);which(callrate==rates)->rateindex}
thresholds<-c(2.606, 2.787,2.996,3.236,3.523,3.873,4.318,4.929,5.874,7.811,298.484)
thresh <- thresholds[rateindex]
#END - Process call rates

#BEGIN - Now read in regression coefficients 
#Do some processing with the grand covariance matrix - (from grandcov.out)

M <- read.table(regionsfile,nrows=1,as.is=T)
M <- as.numeric(M)
dat <- read.table(regionsfile,nrows=9,as.is=T,skip=2)
S <- matrix(NA,9,9)
for (i in 1:9) { S[i,] <- as.numeric(dat[i,]) }
SAA <- matrix(NA,2,2)
SAB <- matrix(NA,2,2)
SBB <- matrix(NA,2,2)
SAA[1,1] <- M[1]
SAA[2,2] <- M[2]
SAA[1,2] <- SAA[2,1] <- (M[3])*sqrt(SAA[1,1])*sqrt(SAA[2,2])
SAB[1,1] <- M[4]
SAB[2,2] <- M[5]
SAB[1,2] <- SAB[2,1] <- (M[6])*sqrt(SAB[1,1])*sqrt(SAB[2,2])
SBB[1,1] <- M[7]
SBB[2,2] <- M[8]
SBB[1,2] <- SBB[2,1] <- (M[9])*sqrt(SBB[1,1])*sqrt(SBB[2,2])


#Do some processing with the grandmean matrix - (from grandmean.out)
dat <- read.table(regionsfile,as.is=T,skip=18,nrows=14)
al1 <- dat[1,]
al2 <- dat[2,]

b11 <- dat[3,]
b12 <- dat[4,]
b13 <- dat[5,]
b14 <- dat[6,]
b21 <- dat[7,]
b22 <- dat[8,]
b23 <- dat[9,]
b24 <- dat[10,]

g11 <- dat[11,]
g12 <- dat[12,]
g21 <- dat[13,]
g22 <- dat[14,]

s1g2 <- matrix(c(g11,g12,g21,g22),byrow=T,nrow=2)
B <- matrix(c(b11,b12,b13,b14,b21,b22,b23,b24),byrow=T,nrow=2)
al <- c(al1,al2)

#Do some processing with the 2nd grandmean matrix - (from grandmean2.out)
Bdat <- read.table(regionsfile,as.is=T,skip=33,nrows=14)
Bal1 <- Bdat[1,]
Bal2 <- Bdat[2,]
Bb11 <- Bdat[3,]
Bb12 <- Bdat[4,]
Bb13 <- Bdat[5,]
Bb14 <- Bdat[6,]
Bb21 <- Bdat[7,]
Bb22 <- Bdat[8,]
Bb23 <- Bdat[9,]
Bb24 <- Bdat[10,]

Bg11 <- Bdat[11,]
Bg12 <- Bdat[12,]
Bg21 <- Bdat[13,]
Bg22 <- Bdat[14,]

Bs1g2 <- matrix(c(Bg11,Bg12,Bg21,Bg22),byrow=T,nrow=2)
BnB <- matrix(c(Bb11,Bb12,Bb13,Bb14,Bb21,Bb22,Bb23,Bb24),byrow=T,nrow=2)
Bal <- c(Bal1,Bal2)

#Do some processing with the 3rd grandmean matrix - (from grandmean3.out)
Cdat <- read.table(regionsfile,as.is=T,skip=48,nrows=14)
Cal1 <- Cdat[1,]
Cal2 <- Cdat[2,]

Cb11 <- Cdat[3,]
Cb12 <- Cdat[4,]
Cb13 <- Cdat[5,]
Cb14 <- Cdat[6,]
Cb21 <- Cdat[7,]
Cb22 <- Cdat[8,]
Cb23 <- Cdat[9,]
Cb24 <- Cdat[10,]
Cg11 <- Cdat[11,]
Cg12 <- Cdat[12,]
Cg21 <- Cdat[13,]
Cg22 <- Cdat[14,]

Cs1g2 <- matrix(c(Cg11,Cg12,Cg21,Cg22),byrow=T,nrow=2)
CB <- matrix(c(Cb11,Cb12,Cb13,Cb14,Cb21,Cb22,Cb23,Cb24),byrow=T,nrow=2)
Cal <- c(Cal1,Cal2)
#END - Now read in regression coefficients 

#BEGIN - READ THE FILES
regionfile.obj <- read.table(regionsfile,as.is=T,skip=63)
thetafile.obj <- read.table(thetafile,as.is=T)
print(dim(thetafile.obj))
#END - READ THE FILES

NUMSAMP <- (ncol(thetafile.obj)-1)/2	# Number of chips
NUMSNP <- nrow(regionfile.obj)		# Number of SNPs in regionsfile


# Match the SNPs in the thetafile to the SNPs in the regions file
snpid.thetafile<-thetafile.obj[,1]
print(snpid.thetafile)
snpid.regionfile<-regionfile.obj[,1]
m <- match(snpid.thetafile,snpid.regionfile)

# END - Match the SNPs in the thetafile to the SNPs in the regions file

#BEGIN - MAIN LOOP
for (r in 1:nrow(thetafile.obj)) # start looping over the SNPs in thetafile
{ 
discontinue <- FALSE

snpid <- thetafile.obj[r,1]
n <- m[r]	#corresponding index in the regions file
if (is.na(n)) {discontinue<-TRUE; error<-paste("Can't find ",snpid," in the regions file"); print(error)}
else {		#If match obtained for SNP in the regions file
num.AA <- regionfile.obj[n,2]
num.AB <- regionfile.obj[n,3]
num.BB <- regionfile.obj[n,4]
M.AA <- as.numeric(regionfile.obj[n,5:6])
M.AB <- as.numeric(regionfile.obj[n,7:8])
M.BB <- as.numeric(regionfile.obj[n,9:10])
S.AA <- as.numeric(regionfile.obj[n,11:13])
S.AB <- as.numeric(regionfile.obj[n,14:16])
S.BB <- as.numeric(regionfile.obj[n,17:19])
S.AA <- matrix(c(S.AA[1],S.AA[3],S.AA[3],S.AA[2]),nrow=2,byrow=T)
S.AB <- matrix(c(S.AB[1],S.AB[3],S.AB[3],S.AB[2]),nrow=2,byrow=T)
S.BB <- matrix(c(S.BB[1],S.BB[3],S.BB[3],S.BB[2]),nrow=2,byrow=T)


  #Exclude singletons or monomorphic SNPs
  zero.AA <- num.AA==0
  zero.AB <- num.AB==0
  zero.BB <- num.BB==0
  one.AA <- num.AA==1
  one.AB <- num.AB==1
  one.BB <- num.BB==1
 
 if ((zero.AA+zero.AB==2) | (zero.AB+zero.BB==2) | (zero.AA+zero.BB==2)) {discontinue <- TRUE}
 if ((zero.AA+one.AB==2) | (zero.AB+one.BB==2) | (zero.AA+one.BB==2)) {discontinue <- TRUE}
 if ((one.AA+zero.AB==2) | (one.AB+zero.BB==2) | (one.AA+zero.BB==2)) {discontinue <- TRUE}
  #END - Exclude singletons or monomorphic SNPs

} # END - If match obtained for SNP in the regions file

if (discontinue==0)
{
g <- array("-1",NUMSAMP)

THETA.A <- as.numeric(thetafile.obj[r,2:(NUMSAMP+1)])
THETA.B <- as.numeric(thetafile.obj[r,(NUMSAMP+2):(2*NUMSAMP+1)])


res<- calc.meancov(num.AA,num.AB,num.BB,M.AA,M.AB,M.BB,S.AA,S.AB,S.BB,B,al,BnB,Bal,CB,Cal, THETA.A,THETA.B, SAA, SAB, SBB,callrate,thresh)

mdist <- res
rlmm <- c(as.character(snpid),res)
write.table(t(rlmm),genotypefile,quote=F,sep=" ",row.names=F,col.names=F,append=T)
status <- paste("Processed",snpid)
print(status)
} #end if NOT discontinue

if (discontinue==1)	#when this SNP cannot be genotyped
{
res <- c("NC","NA")
res <- rep(res,NUMSAMP)
rlmm <- c(as.character(snpid),res)
write.table(t(rlmm),genotypefile,quote=F,sep=" ",row.names=F,col.names=F,append=T)
} #end if discontinue

} # end r loop of all the HapMap SNPs
#END  - MAIN LOOP

}

attr(Classify, "source") <- "See documentation or e-mail Nusrat Rabbee at nrabbee@post.harvard.edu"
create_Thetafile <- function(probefiledir = getwd(),start=1,end=-1,thetafile=""){

   ## This program creates allele summaries (theta_A and theta_B values) for
   ## each chip , each SNP. It uses Robust Linear Model with constraints 
   ## for the A probes and B probes separately, in order to produce the
   ## the theta_A and theta_B values
   ## --------- 
   ## Functions within create_Thetafile: RMA & write.theta
   ##
   ## BY: Nusrat Rabbee & Gary Wong
   #########################################################################


####### DEFINE THE FUNCTIONS #########
RMA <- function(A,B,flag=FALSE) 
{

 new.y.A <- as.vector(A)
 new.y.B <- as.vector(B)
 num.probes <- nrow(A)
 num.chips <- ncol(A)
 
 temp<- seq(1,num.chips,1)
 temp2<- rep(1,num.probes)
 C2 <- kronecker(temp,temp2)
 temp <- rep(1,num.chips)
 temp2 <- seq(1,num.probes,1)
 P2 <- kronecker(temp,temp2)
       
 aa.A <- rlm(new.y.A ~ as.factor(C2) + C(as.factor(P2),"contr.sum"),maxit=100)
 aa.B <- rlm(new.y.B ~ as.factor(C2) + C(as.factor(P2),"contr.sum"),maxit=100)
 int.A <- coefficients(aa.A)[1]
 int.B <- coefficients(aa.B)[1] 
 chips <- coefficients(aa.A)[2:(num.chips)]
 THETA.A <- c(int.A,int.A+chips)
 chips <- coefficients(aa.B)[2:(num.chips)]
 THETA.B <- c(int.B,int.B+chips)

 THETA.A <- as.vector(THETA.A)
 THETA.B <- as.vector(THETA.B)
 return(c(THETA.A,THETA.B))
}

write.theta <- function(babe,bits,num.samp,num.probes,thetafile)
{

snpid <- snpnames[babe]

print(snpid)
training.size <- length(bits)
idx <- babe

num.samp-> NUMSAMP
x40 <- matrix(num.probes,NUMSAMP)
st <- (idx-1)*num.probes + 1
fn <- (idx-1)*num.probes + num.probes 
x40 <- PROBEDAT[st:fn,]			#extract the probe data; used to be PROBEDAT


y<- x40[1:num.probes,]
y.A <- y[1:(num.probes/2),bits]
y.B <- y[(num.probes/2+1):(num.probes),bits]

aa <- RMA(y.A,y.B)	#Obtain theta for all the samples

THETA.A <- aa[1:(training.size)]
THETA.B <- aa[(1+training.size):(2*training.size)]

THETA.A <- round(THETA.A,4)
THETA.B <- round(THETA.B,4)

res <- c(as.character(snpid),as.character(THETA.A), as.character(THETA.B))
write.table(t(res),thetafile,row.names=F,col.names=F,append=T,sep=" ",quote=F)
return(res)
} # end function

# ==================================================================================
# Main
library(MASS)

if(thetafile=="")
   stop("Please specify a name for the thetafile parameter")

currentdir <- getwd()
print(probefiledir)
list.files(probefiledir)
setwd(probefiledir)
dirFiles <- list.files()
normfiles <- dirFiles[grep(".norm",dirFiles)]
anormfile <- normfiles[1]
fn <- read.table(normfiles[1],as.is=T)
snpnames <- fn[,1]
num.probes<-ncol(fn)-1

NUMSAMP <- length(normfiles)
print(paste("Total number of .norm files found in probefiledir:",NUMSAMP))

if(length(normfiles)==F)
   stop("The correct number of *.norm files is not in the directory")

if(grep(".norm",normfiles[1])==F)
   stop("There are no *.norm files in this directory")

fn <- read.table(normfiles[1],as.is=T)
fn <- fn[,1]
fn <- matrix(data=fn)

#creating PROBEDAT

NUMSNP <- length(snpnames)

print(paste("NUMSNP is", NUMSNP))
print(paste("NUMSAMP is", NUMSAMP))
print(paste("Num of probes are ", num.probes))

if(start<1 | start>NUMSNP) { start <- 1}
 
if(end==-1 | end>NUMSNP) { end <- NUMSNP}

if (start>end)
   {stop("start value is greater than the end value, please change")}

PROBEDAT <- matrix(-1,NUMSNP*(num.probes),NUMSAMP)
for (j in 1:NUMSAMP) # of normfiles
{
d <- paste(normfiles[j],sep="")
d <- as.character(normfiles[j])
print(paste("Processing ",d))
temp <- read.table(d,as.is=T)
temp <- temp[,-1]
PROBEDAT[,j] <- as.numeric(t(temp))
}


babe <- seq(start,end,1)

tr.bits <- seq(1,NUMSAMP,1)	#Get the theta's for all the samples together
setwd(currentdir)
RES <- apply(t(babe),2,write.theta,bits=tr.bits,num.samp=NUMSAMP,num.probes=num.probes,thetafile=thetafile)

rm(PROBEDAT)
#END WRITE_THETA function
}

attr(create_Thetafile, "source") <- "See documentation or e-mail Nusrat Rabbee at nrabbee@post.harvard.edu"
#--------------------------------------------------------------------#
#-- Creates *.norm files from *.raw files which                    --#
#-- has normalized intensity values for PMA and PMB using a pseudo --#
#-- normalization method by using the CQV vector                   --#
#-- By: Nusrat Rabbee & Gary Wong                                  --#
#--------------------------------------------------------------------#

normalize_Rawfiles <- function(cqvfile="", probefiledir = getwd()){  
   
   
   # CQV is a sorted column vector of normalized values
   # Xba.cqv is a text file
   
   print("Reading CQV")
   
   CQV <- read.table(cqvfile)
   CQV <- CQV[[1]]
   
   print("Reading CQV ...Done")
   
   if(length(CQV) == 0)
       stop("Cannot locate CQV within the directory")
   
   CQV <- sort(CQV)
    
# ------
# Read all file names in dir

   currentdir <- getwd()
   setwd(probefiledir)
   
   fn <- list.files()
   fn <- fn[grep(".raw",fn)]
   
   NUMSAMP <- length(fn) #number of *.raw files
   NUMSNP <- length(readLines(fn[1])) #take the length of the first file

   if(grep(".raw",fn[1])==F)
   stop("There are no *.raw files in this directory")
   
   fn <- data.frame(fn)
   fn <- t(fn)
   pma.idx <- seq(2,41,4) # index for 10 pma values
   pmb.idx <- seq(4,41,4) # index for 10 pmb values


   
   
       if((length(CQV)/20) != NUMSNP) {
          error <- paste("File",cqvfile,"is not the correct size")
          stop(error)
	}

   for (j in 1:length(fn)) # of *.raw 
   {
   if(length(readLines(fn[j])) != NUMSNP){
      stop(paste("# of SNPs is not equal to",NUMSNP,"in", fn[j]))    
      cat("The # of SNPs in the *.raw files is correct \n")
      }

   print("Processing File")
   print(as.character(fn[j]))
   
   dat <- read.table(fn[j],as.is=T)
   new <- dat[,c(pma.idx,pmb.idx)] # extract pma & pmb values
   CQVTEMP <- as.vector(t(new))
   NORM <- CQV[floor(rank(CQVTEMP))]
   
   Norm.RawFiles<-matrix(NORM, nrow=length(new[,1]),ncol=length(new[1,]),byrow=TRUE)
   Norm.RawFiles<-as.data.frame(Norm.RawFiles)
   Norm.RawFiles<-data.frame(cbind(dat[,1],Norm.RawFiles))

   write.table(Norm.RawFiles, file = paste(strsplit(fn[j],split=".raw"),"norm",sep="."), quote = FALSE, row.names=FALSE, col.names=FALSE)
   }  # end for loop
  
   setwd(currentdir)
}

attr(normalize_Rawfiles, "source") <- "See documentation or e-mail Nusrat Rabbee at nrabbee@post.harvard.edu"
plot_theta <- function(genotypefile="Xba.rlmm",thetafile="Xba.theta",Pick.Obj="FALSE",plotfile="plots.ps",snpsfile="snps.lst")
{

  if(Pick.Obj==FALSE){
    dat <- read.table(thetafile,as.is=T)
    geno <- read.table(genotypefile,as.is=T)
  }

NUMSAMP<- (ncol(geno)-1)/2
NUMSNP<- nrow(geno)

gn.idx <- seq(2,(2*NUMSAMP+1),2) #-- for Xba.rlmm

if(plotfile!=""){
postscript(plotfile)
}

par(pty="s")
par(mfrow=c(1,1))

babe <- read.table(snpsfile,as.is=T)

match(babe[,1],geno[,1])->idx
print("Matching at Index")
print(idx)

geno<- geno[,gn.idx]	#Extract only the genotype columns

if ( sum(is.na(idx)) == length(idx) ) #none of the requested SNPs are found
{
  print("Could not find the theta values for the SNPs requested for plotting ")
}
if ( sum(is.na(idx)) != length(idx) ) #some of the requested SNPs are found
{
 idx <- idx[!is.na(idx)]
 dat<-dat[idx,]		#Extract only the matched SNP rows
 geno<-geno[idx,]

 THETA.A <- dat[,2:(NUMSAMP+1)]
 THETA.B <- dat[,(NUMSAMP+2):(2*NUMSAMP+1)]

Ac <- array(-1,ncol(geno))
ec <- array(-1,ncol(geno))

for (i in 1:nrow(THETA.A))	#for each SNP in the list
{
g <- as.character(geno[i,])
ind.AA <- g=="AA"
ind.AB <- g=="AB"
ind.BB <- g=="BB"

num.AA <- sum(ind.AA)
num.AB <- sum(ind.AB)
num.BB <- sum(ind.BB)
T.A <- as.numeric(THETA.A[i,])
T.B <- as.numeric(THETA.B[i,])

#DO THE RLMM PLOT
#nc <- round(sum(n=="NC")/NUMSAMP * 100,2)
Ac[g=="AA"] <- 4
Ac[g=="AB"] <- 2
Ac[g=="BB"] <- 3
Ac[g=="NC"] <- 1
ec[g=="AA"] <-24
ec[g=="AB"] <-23
ec[g=="BB"] <-25
ec[g=="NC"] <- 1
label.A <- paste("allele A")
label.B <- paste("allele B")
snpid <- dat[i,1]
plot(T.A,T.B,col=Ac,pch=ec,main=paste(snpid," Allele Summary Plot",sep=" "),xlab=label.A,ylab=label.B,xlim=c(8,15),ylim=c(8,15))

legend (13,15,legend=c("AA","AB","BB","NC"),pch=c(24,23,25,1),col=c(4,2,3,1))
} # end for loop
} # end some of the requested SNPs are found

if(plotfile!=""){
   dev.off()
   }

}
