## ----global_options, include=FALSE---------------------------------------
knitr::opts_chunk$set(warning = FALSE, message = FALSE, eval = TRUE, fig.width = 7, fig.height = 4.2)

## ----load_libraries, results='hide'--------------------------------------
library(RCAS)

## ----sample_data---------------------------------------------------------
library(RCAS)
data(queryRegions) #sample queryRegions in BED format()
data(gff)          #sample GFF file

## ----RCAS_import_data, eval = FALSE--------------------------------------
#  queryRegions <- importBed(filePath = <path to BED file>, sampleN = 10000)
#  gff <- importGtf(filePath = <path to GTF file>)

## ----queryGFF------------------------------------------------------------
overlaps <- as.data.table(queryGff(queryRegions = queryRegions, gffData = gff))

## ----query_gene_types----------------------------------------------------
biotype_col <- grep('gene_biotype', colnames(overlaps), value = T)
df <- overlaps[,length(unique(overlappingQuery)), by = biotype_col]
colnames(df) <- c("feature", "count")
df$percent <- round(df$count / length(queryRegions) * 100, 1)
df <- df[order(count, decreasing = TRUE)]
ggplot2::ggplot(df, aes(x = reorder(feature, -percent), y = percent)) + 
  geom_bar(stat = 'identity', aes(fill = feature)) + 
  geom_label(aes(y = percent + 0.5), label = df$count) + 
  labs(x = 'transcript feature', y = paste0('percent overlap (n = ', length(queryRegions), ')')) + 
  theme_bw(base_size = 14) + 
  theme(axis.text.x = element_text(angle = 90))

## ----getTxdbFeatures-----------------------------------------------------
txdbFeatures <- getTxdbFeaturesFromGRanges(gff)

## ----summarizeQueryRegions-----------------------------------------------
summary <- summarizeQueryRegions(queryRegions = queryRegions, 
                                 txdbFeatures = txdbFeatures)

df <- data.frame(summary)
df$percent <- round((df$count / length(queryRegions)), 3) * 100
df$feature <- rownames(df)
ggplot2::ggplot(df, aes(x = reorder(feature, -percent), y = percent)) + 
  geom_bar(stat = 'identity', aes(fill = feature)) + 
  geom_label(aes(y = percent + 3), label = df$count) + 
  labs(x = 'transcript feature', y = paste0('percent overlap (n = ', length(queryRegions), ')')) + 
  theme_bw(base_size = 14) + 
  theme(axis.text.x = element_text(angle = 90))

## ----getTargetedGenesTable-----------------------------------------------
dt <- getTargetedGenesTable(queryRegions = queryRegions, 
                           txdbFeatures = txdbFeatures)
dt <- dt[order(transcripts, decreasing = TRUE)]

knitr::kable(dt[1:10,])

## ----transcriptBoundaryCoverage------------------------------------------
cvgF <- getFeatureBoundaryCoverage(queryRegions = queryRegions, 
                                   featureCoords = txdbFeatures$transcripts, 
                                   flankSize = 1000, 
                                   boundaryType = 'fiveprime', 
                                   sampleN = 10000)
cvgT <- getFeatureBoundaryCoverage(queryRegions = queryRegions, 
                                   featureCoords = txdbFeatures$transcripts, 
                                   flankSize = 1000, 
                                   boundaryType = 'threeprime', 
                                   sampleN = 10000)

cvgF$boundary <- 'fiveprime'
cvgT$boundary <- 'threeprime'

df <- rbind(cvgF, cvgT)

ggplot2::ggplot(df, aes(x = bases, y = meanCoverage)) + 
  geom_ribbon(fill = 'lightgreen', 
              aes(ymin = meanCoverage - standardError * 1.96, 
                  ymax = meanCoverage + standardError * 1.96)) + 
 geom_line(color = 'black') + 
 facet_grid( ~ boundary) + theme_bw(base_size = 14) 


## ----coverageprofilelist, fig.height=6-----------------------------------
cvgList <- calculateCoverageProfileList(queryRegions = queryRegions, 
                                       targetRegionsList = txdbFeatures, 
                                       sampleN = 10000)

ggplot2::ggplot(cvgList, aes(x = bins, y = meanCoverage)) + 
  geom_ribbon(fill = 'lightgreen', 
              aes(ymin = meanCoverage - standardError * 1.96, 
                  ymax = meanCoverage + standardError * 1.96)) + 
 geom_line(color = 'black') + theme_bw(base_size = 14) +
 facet_wrap( ~ feature, ncol = 3) 

## ----motif_analysis------------------------------------------------------
motifResults <- runMotifRG(queryRegions = queryRegions, 
                           resizeN = 15, sampleN = 10000,
                           genomeVersion = 'hg19', 
                           motifN = 2, nCores = 2)

par(mfrow = c(1,2), mar = c(2,2,2,2))
for (i in 1:length(motifResults$motifs)) {
  motifPattern <- motifResults$motifs[[i]]@pattern
  motifRG::plotMotif(match = motifResults$motifs[[i]]@match$pattern, 
                     main = paste0('Motif-',i,': ',motifPattern),
                     entropy = TRUE)
}

## ----motif_analysis_table------------------------------------------------
summary <- getMotifSummaryTable(motifResults)
knitr::kable(summary)

## ----GO analysis---------------------------------------------------------

#get all genes from the GTF data
backgroundGenes <- unique(gff$gene_id)
#get genes that overlap query regions
targetedGenes <- unique(overlaps$gene_id)

#run TopGO
goBP <- runTopGO(ontology = 'BP', 
                      species = 'human', 
                      backgroundGenes = backgroundGenes, 
                      targetedGenes = targetedGenes)

goBP <- goBP[order(goBP$foldEnrichment, decreasing = TRUE),]
rownames(goBP) <- goBP$GO.ID
goBP <- subset(goBP, select = -c(Annotated,classicFisher, bh, GO.ID))

knitr::kable(goBP[1:10,])

## ----msigdb_analysis-----------------------------------------------------
#geneSets <- parseMsigdb(< path to msigdbFile>)
data(geneSets)
resultsGSEA <- runGSEA(geneSetList = geneSets,
                       backgroundGenes = backgroundGenes, 
                       targetedGenes = targetedGenes)

knitr::kable(x = resultsGSEA[1:10,])

