
## ----echo=FALSE----------------------------------------------------------
suppressPackageStartupMessages(library(VariantAnnotation))
suppressPackageStartupMessages(library(cgdv17))
suppressPackageStartupMessages(library(org.Hs.eg.db))
suppressPackageStartupMessages(library(TxDb.Hsapiens.UCSC.hg19.knownGene))
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg19))
suppressPackageStartupMessages(library(PolyPhen.Hsapiens.dbSNP131))


## ------------------------------------------------------------------------
## Attach the variant tools and the cgdv17 data package, then resolve the
## path of the sample VCF that is shipped inside cgdv17.
library(VariantAnnotation)
library(cgdv17)
file <- system.file("vcf", "NA06985_17.vcf.gz", package = "cgdv17")
 
## Explore the file header with scanVcfHeader
hdr <- scanVcfHeader(file)
 
## Show the INFO field definitions recorded in the header
info(hdr) 
 
## Show the genotype (FORMAT) field definitions recorded in the header
geno(hdr) 


## ------------------------------------------------------------------------
## Map the TRPV gene symbols to Entrez gene identifiers.
## The result is a data.frame with SYMBOL and ENTREZID columns; later steps
## read both columns.
library(org.Hs.eg.db)
genesym <- c("TRPV1", "TRPV2", "TRPV3")
## 'columns' is the current name of the argument formerly called 'cols';
## AnnotationDbi no longer accepts the old spelling.
geneid <- select(org.Hs.eg.db, keys = genesym, keytype = "SYMBOL",
                 columns = "ENTREZID")
geneid


## ------------------------------------------------------------------------
## Load the hg19 knownGene transcript annotation and group its
## transcripts by gene.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txbygene <- transcriptsBy(txdb, by = "gene")


## ------------------------------------------------------------------------
## Keep only the genes on chr17, then rename the sequence level from the
## UCSC style ("chr17") to "17" so that the ranges can be used to query
## the VCF file.
## pruning.mode = "coarse" removes every gene that has a transcript on any
## other sequence. Without it current GenomeInfoDb releases stop with an
## error, because 'txbygene' still holds ranges on the seqlevels being
## dropped.
tx_chr17 <- keepSeqlevels(txbygene, "chr17", pruning.mode = "coarse")
tx_17 <- renameSeqlevels(tx_chr17, c(chr17 = "17"))

## Create the gene ranges for the TRPV genes: subset the per-gene
## transcripts by Entrez id (keeping the order of 'geneid') and collapse
## each gene to the range spanning its transcripts.
## Assumes each gene's transcripts lie on a single strand, so range()
## yields exactly one range per gene and the symbols line up.
gnrng <- unlist(range(tx_17[geneid$ENTREZID]), use.names = FALSE)
names(gnrng) <- geneid$SYMBOL


## ------------------------------------------------------------------------
## Limit the import to the TRPV gene ranges, the "DP" info field and the
## "GT" and "cPd" genotype fields
param <- ScanVcfParam(info = "DP", geno = c("GT", "cPd"), which = gnrng)
param

## Extract the TRPV ranges from the VCF file
vcf <- readVcf(file, genome = "hg19", param = param)

## Inspect the VCF object: overall summary, fixed fields, genotype data
vcf

head(fixed(vcf))

geno(vcf)


## ------------------------------------------------------------------------
## Compare the sequence names used by the variants and by the annotation
seqlevels(vcf)
head(seqlevels(txdb))
## seqlevels do not match
intersect(seqlevels(vcf), seqlevels(txdb))
## Rename the VCF sequence level to the "chr17" style used by txdb
vcf_mod <- renameSeqlevels(vcf, c("17" = "chr17"))
## seqlevels now match
intersect(seqlevels(vcf_mod), seqlevels(txdb))

## Use the 'region' argument to define the region
## of interest. See ?locateVariants for details.
cds <- locateVariants(vcf_mod, txdb, region = CodingVariants())
five <- locateVariants(vcf_mod, txdb, region = FiveUTRVariants())
splice <- locateVariants(vcf_mod, txdb, region = SpliceSiteVariants())
intron <- locateVariants(vcf_mod, txdb, region = IntronVariants())
all <- locateVariants(vcf_mod, txdb, region = AllVariants())


## ------------------------------------------------------------------------
## Did any variants match more than one gene?
## Group the gene ids by variant (QUERYID) and flag variants that carry
## more than one distinct gene id. vapply() pins the result to a logical
## vector even when there are no variants at all.
table(vapply(split(values(all)[["GENEID"]], values(all)[["QUERYID"]]),
             function(ids) length(unique(ids)) > 1L,
             logical(1)))

## Summarize the number of variants by gene
## lapply() always returns a named list. sapply() would collapse the result
## to a vector or matrix whenever every gene has the same number of
## variants, which breaks both the counts and names(idx) below.
idx <- lapply(split(values(all)[["QUERYID"]], values(all)[["GENEID"]]),
              unique)
vapply(idx, length, integer(1))

## Summarize variant location by gene
## sapply() is kept here on purpose: it binds the per-gene tables into one
## matrix for display when they all have the same length.
## NOTE(review): duplicated() on a GRanges compares the ranges only, not
## the QUERYID/LOCATION columns -- confirm that collapsing records by
## position is the intended de-duplication.
sapply(names(idx),
    function(nm) {
        d <- all[values(all)[["GENEID"]] %in% nm, c("QUERYID", "LOCATION")]
        table(values(d)[["LOCATION"]][!duplicated(d)])
    })


## ------------------------------------------------------------------------
## Predict the coding consequences of the variants, taking the reference
## sequence from the hg19 BSgenome object
library(BSgenome.Hsapiens.UCSC.hg19)
aa <- predictCoding(vcf_mod, txdb, seqSource = Hsapiens)


## ------------------------------------------------------------------------
## Did any variants match more than one gene?
## Group the gene ids by variant (QUERYID) and flag variants that carry
## more than one distinct gene id. vapply() pins the result to a logical
## vector even when there are no coding variants at all.
table(vapply(split(values(aa)[["GENEID"]], values(aa)[["QUERYID"]]),
             function(ids) length(unique(ids)) > 1L,
             logical(1)))

## Summarize the number of variants by gene
## lapply() always returns a named list. sapply() would collapse the result
## to a vector or matrix whenever every gene has the same number of
## variants, which breaks both the counts and names(idx) below.
idx <- lapply(split(values(aa)[["QUERYID"]], values(aa)[["GENEID"]],
                    drop = TRUE), unique)
vapply(idx, length, integer(1))

## Summarize variant consequence by gene
## sapply() is kept here on purpose: it binds the per-gene tables into one
## matrix for display when they all have the same length.
## NOTE(review): duplicated() on a GRanges compares the ranges only, not
## the QUERYID/CONSEQUENCE columns -- confirm that collapsing records by
## position is the intended de-duplication.
sapply(names(idx),
       function(nm) {
           d <- aa[values(aa)[["GENEID"]] %in% nm, c("QUERYID", "CONSEQUENCE")]
           table(values(d)[["CONSEQUENCE"]][!duplicated(d)])
       })


## ------------------------------------------------------------------------
## Load the PolyPhen package and explore the available keys and columns.
## columns() and the 'columns' argument are the current names of what used
## to be cols() and 'cols'; the old spellings are no longer accepted.
library(PolyPhen.Hsapiens.dbSNP131)
keys <- keys(PolyPhen.Hsapiens.dbSNP131)
cols <- columns(PolyPhen.Hsapiens.dbSNP131)
## column descriptions are found at ?PolyPhenDbColumns
columns(PolyPhen.Hsapiens.dbSNP131)

## Get the rsids for the non-synonymous variants from the
## predictCoding results. %in% never yields NA, so a missing consequence
## cannot introduce an NA key.
rsid <- unique(names(aa)[values(aa)[["CONSEQUENCE"]] %in% "nonsynonymous"])

## Retrieve predictions for non-synonymous variants. Two of the six variants
## are found in the PolyPhen database.
select(PolyPhen.Hsapiens.dbSNP131, keys = rsid,
       columns = c("AA1", "AA2", "PREDICTION"))


## ----eval=FALSE----------------------------------------------------------
## library(BiocInstaller)
## biocLite("VariantAnnotation")


## ----eval=FALSE----------------------------------------------------------
## library(VariantAnnotation)


## ----eval=FALSE----------------------------------------------------------
## browseVignettes(package="VariantAnnotation")


## ----eval=FALSE----------------------------------------------------------
## help.start()


## ------------------------------------------------------------------------
## Print the R version and the versions of the attached and loaded
## packages used to run this script
sessionInfo()


