### R code from vignette source 'ENCODEFig4Band4D.Rnw'

###################################################
### code chunk number 1: loadlibs
###################################################
library("ENCODEFig4Band4D")
library("randomForest")
library("earth")

# Load the "TF-Model" dataset.
# NOTE(review): presumably this is what provides TF_binding_profile_160bin and
# TF_model_data used further down -- confirm against the package contents.
data("TF-Model")


###################################################
### code chunk number 2: setup4B
###################################################
# Rows of the binding-profile table are consumed in groups of three: the
# second row of each group goes to dat1 and the third to dat2 (the first row
# of each group is not used in this script).
data <- TF_binding_profile_160bin
cnum <- nrow(data) / 3
dat1 <- data[3 * (1:cnum) - 1, ]
dat2 <- data[3 * (1:cnum), ]

dim(dat1)

# Columns kept for the figure.
se <- c("SYDHTFBS_K562B_YY1", "SYDHTFBS_K562_CJUN",
        "SYDHTFBS_K562_STAT1.1", "SYDHTFBS_K562_USF2")

# Keep rows 21..60 (the 40 bins around the TSS) and the selected columns.
dat1 <- dat1[21:60, se]
dat2 <- dat2[21:60, se]

# cnum is reused: from here on it is the number of selected columns.
cnum <- ncol(dat1)


###################################################
### code chunk number 3: mynam1
###################################################
# Third "_"-separated field of every selected column name.
# vapply() pins the result to a character vector; sapply() would change its
# return type depending on the input it is given.
mynam <- vapply(strsplit(colnames(dat1), split = "_"), `[`, character(1), 3)
mynam


###################################################
### code chunk number 4: mynam2
###################################################
# Hand-written labels, one per selected column; they replace the values parsed
# from the column names and are used as x-axis labels in the panels below.
mynam <- c("YY1", "JUN", "STAT1", "USF2")


###################################################
### code chunk number 5: Main_Fig4B
###################################################
# 2 x 2 panel grid with tight margins and short tick marks.
par(mfrow = c(2, 2), mar = c(2, 2, 0.5, 0.5), mgp = c(1.0, 0.2, 0),
    tcl = -0.2, lend = 2)

# One panel per selected column: dat1 is drawn in red, dat2 in green, on a
# log-scaled y axis.
for (k in seq_len(cnum)) {
  # x positions of the 40 rows: -1950, -1850, ..., 1950
  tmp <- 100 * (-20:19) + 50

  # common y range so that both curves fit inside the panel
  miny <- min(dat1[, k], dat2[, k])
  maxy <- max(dat1[, k], dat2[, k])

  plot(tmp, as.numeric(dat1[, k]),
       type = "l", log = "y", ylim = c(miny, maxy),
       xlab = mynam[k], ylab = "Average Signal",
       col = "red", lwd = 2, pch = 20, cex.axis = 0.7, cex.lab = 0.9)
  lines(tmp, as.numeric(dat2[, k]),
        type = "l", col = "green", lwd = 2, pch = 20)
  abline(v = 0, lty = 2)

  # the colour key is drawn once, in the first panel, at fixed coordinates
  if (k == 1) {
    mytext <- c("HCP", "LCP")
    legend(500, 1.31, legend = mytext, col = c("red", "green"),
           lwd = 2, cex = 0.6, bty = "n")
  }
}


###################################################
### code chunk number 6: rawdata
###################################################
# Keep an unmodified reference to the model table; the following chunks start
# from it each time they rebuild `data`.
rawdata <- TF_model_data


###################################################
### code chunk number 7: rf
###################################################
# Stage 1: random-forest classifier separating rows whose first column is 0
# ("No") from rows whose first column is positive ("Yes"), followed by a
# hand-computed ROC curve and its area on the held-out rows.
# NOTE(review): no RNG seed is set anywhere in this script, so the sampled
# split, the fitted forest and every statistic derived from them vary between
# runs.
data <- rawdata
dat1 <- data[data[, 1] == 0, ]
dat2 <- data[data[, 1] > 0, ]
dim(dat1)
dim(dat2)

dat1[, 1] <- "No"
dat2[, 1] <- "Yes"

# Balanced training set: 1000 rows drawn from each class; every remaining row
# becomes test data.
se1 <- sample(seq_len(nrow(dat1)), 1000)
se2 <- sample(seq_len(nrow(dat2)), 1000)
tr <- rbind(dat1[se1, ], dat2[se2, ])
te <- rbind(dat1[-se1, ], dat2[-se2, ])

# Remember the training rows so the next chunk can exclude them.
class.gen <- row.names(tr)
tr[, 1] <- as.factor(tr[, 1])
mm1 <- randomForest(tr[, -1], tr[, 1])
pre <- predict(mm1, te[, -1])
res <- table(pre, te[, 1])

# Misclassification rate on the test rows (off-diagonal share of the table).
(res[1, 2] + res[2, 1]) / sum(res)

# ROC curve from the predicted class probabilities. The first probability
# column is thresholded at 0.01, 0.02, ..., 0.99.
# NOTE(review): presumably column 1 is the "No" class (first factor level) --
# confirm against predict.randomForest.
pre <- predict(mm1, te[, -1], type = "prob")
res <- pre
thr <- (1:99) * 0.01

p_no <- res[, 1]
is_no <- te[, 1] == "No"
is_yes <- te[, 1] == "Yes"

# yy: share of true "No" rows at or above each threshold;
# xx: share of true "Yes" rows at or above each threshold.
yy <- vapply(thr, function(cut) sum(p_no >= cut & is_no) / sum(is_no),
             numeric(1))
xx <- vapply(thr, function(cut) sum(p_no >= cut & is_yes) / sum(is_yes),
             numeric(1))

# Close the curve at (1, 1) and (0, 0), then integrate with the trapezoid
# rule: strip widths along xx times the mean height of neighbouring yy values.
xx <- c(1, xx, 0)
yy <- c(1, yy, 0)
tmp1 <- xx[-length(xx)] - xx[-1]
tmp2 <- (yy[-1] + yy[-length(yy)]) / 2
myauc <- sum(tmp1 * tmp2)


###################################################
### code chunk number 8: pred
###################################################
# Stage 2: regression on the rows that were not used to train the classifier.
data <- rawdata
data <- data[!(row.names(data) %in% class.gen), ]
pre <- predict(mm1, data[, -1])
sum(pre == "Yes")  # number of rows the classifier labels "Yes"

# Rows predicted "No" contribute only their measured first-column value;
# rows predicted "Yes" are passed on to the regression model.
my0 <- data[pre == "No", 1]
data <- data[pre == "Yes", ]

dim(data)

# The response is log2 of the first column after adding an offset of 0.03.
data[, 1] <- log2(data[, 1] + 0.03)
se <- sample(seq_len(nrow(data)), 2000)
tr <- data[se, ]
te <- data[-se, ]
mm2 <- randomForest(tr[, -1], tr[, 1])
pre <- predict(mm2, te[, -1])

# Accuracy of the regression alone on its held-out rows: correlation, root
# mean squared error and coefficient of determination.
corr1 <- cor(pre, te[, 1])
rmse1 <- sqrt(sum((pre - te[, 1])^2) / length(pre))
cod1 <- 1 - sum((te[, 1] - pre)^2) / sum((te[, 1] - mean(te[, 1]))^2)


###########################
# Combined accuracy: every row predicted "No" gets the constant prediction
# log2(0.03) and is appended to the regression predictions.
xx <- c(pre, rep(log2(0.03), length(my0)))
yy <- c(te[, 1], log2(my0 + 0.03))
corr2 <- cor(xx, yy)
rmse2 <- sqrt(sum((xx - yy)^2) / length(xx))
cod2 <- 1 - sum((yy - xx)^2) / sum((yy - mean(yy))^2)

# Full-length copies of the predicted / measured vectors.
rxx <- xx
ryy <- yy

# xx / yy are cut down to a random sample of about 10% of the points; these
# are the vectors plotted in the next chunk.
xnum <- length(rxx) * 0.10
se <- sample(seq_along(rxx), xnum)
xx <- rxx[se]
yy <- ryy[se]


###################################################
### code chunk number 9: Fig4D
###################################################
# Write the two-column figure to a PDF: left half is the predicted-vs-measured
# scatter plot, right half holds two stacked bar charts of variable importance
# (classifier mm1 on top, regression model mm2 below).
myFig <- "Main_Fig4D.pdf"

pdf(file = myFig, height = 8, width = 16, pointsize = 9)
split.screen(c(1, 2))

## Left: scatter plot with a least-squares line.
screen(1)
par(mar = c(5, 5, 5, 2) + 0.1)
plot(xx, yy,
     xlab = "predicted expression (log2)",
     ylab = "measured expression (log2)",
     main = "CAGE PolyA+ K562 Whole Cell",
     cex.main = 2, cex.lab = 1.8, cex.axis = 1.5,
     col = "#0000ff22", pch = 16, cex = 2)
mylm <- lm(yy ~ xx)
abline(mylm, col = "red")

# Annotations are stacked in the top-left corner: anchored at the smallest x
# and stepped down from the largest y in units of 1/20 of the y range.
# NOTE(review): the statistics in these labels are literal strings; they are
# not derived from corr1/rmse1, corr2/rmse2 or myauc computed earlier in the
# script -- confirm they still describe the plotted data.
maxy <- max(yy)
miny <- min(yy)
minx <- min(xx)
posx <- minx
posy <- maxy - (maxy - miny) / 20
text(posx, posy, pos = 4, "Pearson's r=0.81; RMSE=2.57", cex = 2)
posy <- maxy - 1.9 * (maxy - miny) / 20
text(posx, posy, pos = 4, "Classification: AUC = 0.89", cex = 1.4)
posy <- maxy - 2.6 * (maxy - miny) / 20
text(posx, posy, pos = 4, "Regression: r = 0.62; RMSE = 3.06", cex = 1.4)

## Right: split screen 2 into two rows (screens 3 and 4).
split.screen(figs = c(2, 1), screen = 2)

# Top: first importance column of the classifier, largest first.
screen(3)
par(mar = c(5, 6, 5, 2) + 0.1, lwd = 2)
tmp <- sort(mm1$importance[, 1], decreasing = TRUE)
barplot(tmp, ylab = "Classification\n(Mean Decreased GINI)", names.arg = "",
        main = "Relative importance of TFs", cex.lab = 1.5, cex.main = 2)
# Bar labels are drawn by hand, rotated 45 degrees, just below the axis;
# xpd = TRUE lets them extend outside the plot region.
par(xpd = TRUE)
posx <- 1.5 + 1.2 * (seq_along(tmp) - 1)
posy <- -max(tmp) / 40
text(posx, posy, labels = names(tmp), pos = 2, srt = 45, cex = 0.8)

# Bottom: same layout for the regression model.
screen(4)
par(mar = c(5, 6, 1, 2) + 0.1, lwd = 2)
tmp <- sort(mm2$importance[, 1], decreasing = TRUE)
barplot(tmp, ylab = "Regression\n(Increase of Node Purity)", names.arg = "",
        cex.lab = 1.5)
par(xpd = TRUE)
posx <- 1.5 + 1.2 * (seq_along(tmp) - 1)
posy <- -max(tmp) / 40
text(posx, posy, labels = names(tmp), pos = 2, srt = 45, cex = 0.8)


close.screen(all = TRUE)
dev.off()


