# Records the string "harbChIP" under the name `.packageName`.
# NOTE(review): presumably written by package build tooling to tag this
# collection of definitions with its package -- confirm before editing.
.packageName <- "harbChIP"


# S4 container for upstream sequences, keyed by ORF name.
#
# Slots:
#   seqs       - environment; buildUpstreamSeqs2() stores one sequence
#                object per ORF name here
#   chrom      - environment; buildUpstreamSeqs2() stores one chromosome
#                label per ORF name here
#   revComp    - environment (presumably per-ORF reverse-complement flags;
#                nothing in this file populates it -- TODO confirm)
#   type       - environment (contents not established in this file)
#   organism   - character
#   provenance - any object
setClass(
  Class = "upstreamSeqs",
  representation = representation(
    seqs = "environment",
    chrom = "environment",
    revComp = "environment",
    type = "environment",
    organism = "character",
    provenance = "ANY"
  )
)

#library(harbChIP)
#data(harbChIP)
#data(sceUpstr)

#1 generate and investigate hexamer tables

#library(Biostrings)

#us1 = getUpstream("YAL001C", sceUpstr)[[1]]
#hus1 = views(us1, start=1:495, end=6:500)
#hus1
#sort(table(as.character(hus1)), decr=TRUE)[1:8]

# how many unique hexamers are found in upstream
# regions of YAL001C and YAL002W?

# how many unique hexamers are present in YAL002W but
# absent in YAL001C?  verify that "GGAATC" meets this
# condition

#2 test for independence of heptamer occupancy and
# TF binding intensity for a given heptamer and TF
# example GGCGCTA, SNT2

#myhep = "GGCGCTA"
#countPattern( myhep, getUpstream("YAL001C", sceUpstr)[[1]] )
# Count occurrences of a pattern in every upstream sequence of a collection.
#
# patt:  pattern handed to countPattern() as its first argument.
# upstr: upstream-sequence object to scan (anything keys() and getUpstream()
#        accept).  Previously this argument was ignored and the global
#        `sceUpstr` was always scanned; it is now honoured.
# struc: explicit override of the object to scan.  When neither `upstr` nor
#        `struc` is supplied the global `sceUpstr` is used, as before.
#
# Returns the per-ORF counts from countPattern(), named by ORF.
chkAllUS = function(patt, upstr, struc=sceUpstr) {
 # An explicitly supplied `struc` wins; otherwise prefer the caller's `upstr`
 # over the global default.
 if (missing(struc) && !missing(upstr)) struc = upstr
 orfs = keys(struc)
 # One bulk lookup returns a list already named by ORF.
 allu = getUpstream(orfs, struc)
 occ = sapply(allu, function(x) countPattern(patt, x))
 names(occ) = orfs
 occ
}

#UShep = chkAllUS( myhep, sceUpstr )
#summary(UShep)
#BRAT.snt2 = exprs(harbChIP)[,"SNT2"]
#common = intersect(names(UShep), names(BRAT.snt2))
#UShep = UShep[common]
#BRAT.snt2 = BRAT.snt2[common]
#summary(BRAT.snt2)
#hasHep = common %in% names(UShep[UShep>0])
#hiBind = common %in% names(na.omit(BRAT.snt2[BRAT.snt2>2]))
#table(hasHep, hiBind)
#fisher.test(table(hasHep, hiBind))
   
#3 write the software to test for enrichment of a given pattern
# in intergenic regions to which a given TF binds strongly

# Test whether presence of a motif in upstream regions is associated with
# strong binding of a transcription factor.
#
# motif:       pattern passed to chkAllUS().
# TF:          sample name; must be in sampleNames(chset).
# chset:       object providing sampleNames() and exprs(); the column of
#              exprs(chset) named TF holds the binding values.
# upstr:       upstream-sequence object to scan for the motif.
# bthresh:     an ORF is "hiBind" when its binding value exceeds this
#              (missing values count as not bound).
# countthresh: an ORF "hasMotif" when its motif count exceeds this.
#
# Returns a list with the matched call, the 2 x 2 table of hasMotif by
# hiBind, and the fisher.test() result for that table.
chkMotif4TF = function(motif, TF, chset, upstr, bthresh=2, countthresh=0) {
 if (!(TF %in% sampleNames(chset))) stop("TF not found in sampleNames of chset")
 cat("generating motif counts ...\n")
 # Pass `upstr` as `struc` explicitly so the caller's object is the one
 # scanned rather than chkAllUS()'s global default.
 allus = chkAllUS( motif, upstr, struc=upstr )
 cat("done.\n")
 bvec = exprs(chset)[,TF]
 # Restrict both vectors to ORFs present in each, in the same order.
 common = intersect(names(allus), names(bvec))
 allus = allus[common]
 bvec = bvec[common]
 # Fix both factors to FALSE/TRUE levels so the table is always 2 x 2, even
 # when no ORF (or every ORF) falls in a category; otherwise fisher.test()
 # rejects the 1-row or 1-column table.
 lev = c(FALSE, TRUE)
 hasMotif = factor(common %in% names(allus[allus>countthresh]), levels=lev)
 hiBind = factor(common %in% names(na.omit(bvec[bvec > bthresh])), levels=lev)
 tab = table(hasMotif, hiBind)
 # The table expression is kept inline so the data label reported by the
 # test result stays "table(hasMotif, hiBind)".
 tt = fisher.test( table(hasMotif, hiBind) )
 ca = match.call()
 list(call=ca, tab=tab, test=tt)
}
 

# Display method for "upstreamSeqs" objects: prints the organism, the number
# of stored sequences, and up to the first five keys.
setMethod("show", "upstreamSeqs", function (object) 
{
#   The organism slot is read directly rather than through organism():
#   as of Oct 07 that accessor conflicts with annotate::organism, and a
#   namespace will eventually be needed to resolve it.
    nentries = length(seqs(object))
    cat("upstreamSeqs instance, organism ", object@organism, 
        "\n")
    cat("There are ", nentries, " entries\n")
    cat("first keys: \n")
#   seq_len() yields an empty index for an empty object, where 1:0 would
#   have indexed element 1 and printed NA.
    print(keys(object)[seq_len(min(5, nentries))])
#    cat(sum(unlist(as.list(isRevcomp(object)))), " sequences are reverse complement\n")
})

# Generic accessor "keys": one-argument generic dispatching on x.
setGeneric(name = "keys", def = function(x) standardGeneric("keys"))

# Method for "upstreamSeqs": the names of all objects held in the
# environment returned by seqs(x), as listed by ls().
setMethod(
  f = "keys",
  signature = signature(x = "upstreamSeqs"),
  definition = function(x) {
    seq_env <- seqs(x)
    ls(seq_env)
  }
)
# Generic accessor "seqs": one-argument generic dispatching on x.
setGeneric(name = "seqs", def = function(x) standardGeneric("seqs"))

# Method for "upstreamSeqs": returns the object's `seqs` slot (an
# environment) unchanged.
setMethod(
  f = "seqs",
  signature = signature(x = "upstreamSeqs"),
  definition = function(x) {
    x@seqs
  }
)
# Generic accessor "organism": one-argument generic dispatching on x.
setGeneric(name = "organism", def = function(x) standardGeneric("organism"))

# Method for "upstreamSeqs": returns the object's `organism` slot
# (a character vector) unchanged.
setMethod(
  f = "organism",
  signature = signature(x = "upstreamSeqs"),
  definition = function(x) {
    x@organism
  }
)

# Look up stored upstream sequences by ORF name.
#
# orfs:    character vector of names to fetch.
# upstrob: object for which seqs() returns the environment holding the
#          sequences.
#
# Returns the list produced by mget(): one element per requested name,
# named by that name.  mget() signals an error for a name that is absent.
getUpstream <- function(orfs, upstrob) {
  seq_env <- seqs(upstrob)
  mget(orfs, envir = seq_env)
}

# Generic "Nmers": all length-n windows of the upstream sequence of one ORF.
setGeneric("Nmers", function(n, orf, usobj) standardGeneric("Nmers"))

# Method for (numeric, character, upstreamSeqs).
#
# n:     window width; a single positive whole number.
# orf:   a single ORF name present in usobj.
# usobj: the "upstreamSeqs" object to read from.
#
# Returns the result of views() over the sequence with one window per start
# position 1 .. (sequence length - n + 1).
setMethod("Nmers", c("numeric", "character", "upstreamSeqs"), function(n, orf, usobj) {
 if (length(orf) != 1) stop("need single orf name")
 if (length(n) != 1) stop("n must be numeric length 1")
 if (is.na(n) || n < 1 || n != floor(n)) stop("n must be a positive whole number")
 bs = getUpstream(orf, usobj)[[1]]
 seqlen = Biostrings::nchar(bs)
 # Number of complete windows; zero when the sequence is shorter than n.
 # The previous 1:(seqlen-n+1) counted downward in that case and produced
 # invalid window coordinates.
 nwin = max(seqlen - n + 1, 0)
 starts = seq_len(nwin)
 ends = starts + (as.integer(n) - 1L)
 # NOTE(review): with zero windows, views() receives empty start/end
 # vectors -- confirm it returns an empty result for that input.
 views(bs, start=starts, end=ends)
})
 
# All hexamers (length-6 windows) of the upstream sequence of one ORF:
# a thin wrapper that calls Nmers() with n fixed at 6.
allhex <- function(orf, usobj) {
  Nmers(6, orf, usobj)
}

# Build an "upstreamSeqs" object from parsed FASTA records.
#
# fastaRead:  list of records, each with a `desc` element (header text) and
#             a `seq` element (sequence string), e.g. the result of
#             Biostrings::readFASTA.
# organism:   value stored in the `organism` slot.
# provenance: value stored in the `provenance` slot.
#
# For each record the ORF name is the first space-delimited token of `desc`
# with its first character dropped, and the chromosome label is the token
# following "Chr " in `desc`.  Sequences and chromosome labels are stored in
# per-object environments keyed by ORF name.  Progress is written with cat().
#
# Returns a new "upstreamSeqs" instance; an empty `fastaRead` yields an
# object with no entries.
buildUpstreamSeqs2 = function (fastaRead, organism="sce", provenance="harmen") 
{
    norf = length(fastaRead)
    cat("starting transformation to DNAString...")
    bstringlist = lapply(fastaRead, function(z) new("DNAString", 
        z$seq))
    cat("done.")
    # vapply keeps the result a character vector even for empty input and
    # rejects records whose `desc` is not a single string.
    fastaDesc = vapply(fastaRead, function(x) x$desc, character(1))
    orf = gsub(" .*", "", fastaDesc)
    orfnames = substr(orf, 2, nchar(orf))
    chr = gsub(".*Chr ", "", fastaDesc)
    chromvec = gsub(" .*$", "", chr)
    seqs = new.env(hash = TRUE)
    chrom = new.env(hash = TRUE)
    # Nothing is stored in these two here; they are created so each object
    # carries its own empty environments in the corresponding slots.
    revComp = new.env(hash = TRUE)
    type = new.env(hash = TRUE)
    # seq_len() iterates zero times for empty input, where 1:0 would have
    # tried to process records 1 and 0.
    for (i in seq_len(norf)) {
        if (i%%100 == 0) 
            cat(i)
        assign(orfnames[i], bstringlist[[i]], seqs)
        assign(orfnames[i], chromvec[i], chrom)
    }
    new("upstreamSeqs", seqs = seqs, chrom = chrom, revComp = revComp, 
        type = type, organism = organism, provenance = provenance)
}

