## ----knitr-options, echo=FALSE, message=FALSE, warning=FALSE-------------
## To render an HTML version that works nicely with github and web pages, do:
## rmarkdown::render("vignettes/vignette.Rmd", "all")
library(knitr)
opts_chunk$set(fig.align = 'center', fig.width = 6, fig.height = 5, dev = 'png')
library(ggplot2)
theme_set(theme_bw(12))

## ----quickstart-load-data, message=FALSE, warning=FALSE------------------
library(scater, quietly = TRUE)
data("sc_example_counts")
data("sc_example_cell_info")

## ----quickstart-make-sceset, results='hide'------------------------------
pd <- new("AnnotatedDataFrame", data = sc_example_cell_info)
rownames(pd) <- pd$Cell
example_sceset <- newSCESet(countData = sc_example_counts, phenoData = pd)

## ----filter-no-exprs-----------------------------------------------------
keep_feature <- rowSums(exprs(example_sceset) > 0) > 0
example_sceset <- example_sceset[keep_feature,]

## ----quick-start-calc-qc-metrics, eval=FALSE-----------------------------
#  example_sceset <- calculateQCMetrics(example_sceset, feature_controls = 1:40)

## ----quick-start-gui, eval=FALSE-----------------------------------------
#  scater_gui(example_sceset)

## ----sceset-make-sceset-counts-only--------------------------------------
pd <- new("AnnotatedDataFrame", data = sc_example_cell_info)
rownames(pd) <- pd$Cell
gene_df <- data.frame(Gene = rownames(sc_example_counts))
rownames(gene_df) <- gene_df$Gene
fd <- new("AnnotatedDataFrame", data = gene_df)
example_sceset <- newSCESet(countData = sc_example_counts, phenoData = pd,
                            featureData = fd)
example_sceset

## ----sceset-make-sceset-exprs-only---------------------------------------
example2 <- newSCESet(exprsData = edgeR::cpm.default(sc_example_counts))
pData(example2)
fData(example2)

## ----counts-accessor-----------------------------------------------------
counts(example2)[1:3, 1:6]

## ----exprs-accessor------------------------------------------------------
exprs(example2)[1:3, 1:6]

## ----get-exprs-----------------------------------------------------------
get_exprs(example_sceset, "counts")[1:3, 1:6]

## ----set-exprs-----------------------------------------------------------
set_exprs(example2, "counts") <- counts(example_sceset)

## ----sceset-add-count-data-----------------------------------------------
counts(example2) <- sc_example_counts
example2
counts(example2)[1:3, 1:6]

## ----sceset-demo-replacement---------------------------------------------
gene_df <- data.frame(Gene = rownames(sc_example_counts))
rownames(gene_df) <- gene_df$Gene
fd <- new("AnnotatedDataFrame", data = gene_df)
## replace featureData
fData(example_sceset) <- fd
## replace phenotype data
pData(example_sceset) <- pd
## replace expression data to be used
exprs(example_sceset) <- edgeR::cpm.default(counts(example_sceset),
                                            prior.count = 5, log = TRUE)

## ----plot-sceset-blocking------------------------------------------------
plot(example_sceset, block1 = "Mutation_Status", block2 = "Treatment",
     colour_by = "Cell_Cycle", nfeatures = 300, exprs_values = "counts")

## ----plot-expression-----------------------------------------------------
plotExpression(example_sceset, rownames(example_sceset)[1:6],
               x = "Mutation_Status", exprs_values = "exprs", colour = "Treatment")

## ----plot-expression-theme-bw--------------------------------------------
plotExpression(example_sceset, rownames(example_sceset)[7:12],
               x = "Mutation_Status", exprs_values = "counts", colour = "Cell_Cycle",
               show_median = TRUE, show_violin = FALSE,  xlab = "Mutation Status",
               log = TRUE)

## ----calc-qc-metrics-----------------------------------------------------
example_sceset <- calculateQCMetrics(example_sceset, feature_controls = 1:20)
varLabels(example_sceset)

## ----calc-qc-metrics-multi-feature-controls------------------------------
example_sceset <- calculateQCMetrics(
    example_sceset, feature_controls = list(controls1 = 1:20, controls2 = 500:1000),
    cell_controls = list(set_1 = 1:5, set_2 = 31:40))
varLabels(example_sceset)

## ----list-fdata-qc-------------------------------------------------------
names(fData(example_sceset))

## ----plot-qc-expression, fig.height=7.5, fig.width=8.5-------------------
keep_feature <- rowSums(counts(example_sceset) > 0) > 4
example_sceset <- example_sceset[keep_feature,]
## Plot QC
plotQC(example_sceset, type = "highest-expression", exprs_values = "counts")

## ----plot-qc-expression-cell-controls, fig.height=7.5, fig.width=8.5, fig.show=FALSE----
p1 <- plotQC(example_sceset[, !example_sceset$is_cell_control],
             type = "highest-expression")
p2 <- plotQC(example_sceset[, example_sceset$is_cell_control],
       type = "highest-expression")
multiplot(p1, p2, cols = 2)

## ----plot-qc-exprs-freq-vs-mean-default----------------------------------
plotQC(example_sceset, type = "exprs-freq-vs-mean")

## ----plot-qc-exprs-mean-vs-freq-defined-feature-set, results = 'hide', fig.show = FALSE----
feature_set_1 <- fData(example_sceset)$is_feature_control_controls1
plotQC(example_sceset, type = "exprs-freq-vs-mean", feature_set = feature_set_1)

## ----plot-fdata----------------------------------------------------------
plotFeatureData(example_sceset, aes(x = n_cells_exprs, y = pct_total_counts))

## ----plot-pdata, echo=FALSE, fig.show=FALSE, results='hide'--------------
plotPhenoData(example_sceset, aes(x = total_counts, y = total_features,
                                  colour = Mutation_Status))

## ----plot-pdata-cont-col, fig.show = TRUE--------------------------------
plotPhenoData(example_sceset, aes(x = Mutation_Status, y = total_features,
                                  colour = log10_total_counts))

## ----plot-pdata-col-gene-exprs-------------------------------------------
plotPhenoData(example_sceset, aes(x = total_counts, y = total_features,
                                  colour = Gene_1000))

## ----plot-pdatacol-gene-exprs-2, fig.show = FALSE------------------------
plotPhenoData(example_sceset, aes(x = pct_exprs_feature_controls,
                                  y = total_features, colour = Gene_0500))

## ----plot-pdatacol-gene-exprs-3, fig.show = FALSE------------------------
plotPhenoData(example_sceset, aes(x = pct_exprs_feature_controls,
                                  y = pct_exprs_top_50_features,
                                  colour = Gene_0001))

## ----plot-pdata-pct-exprs-controls---------------------------------------
plotPhenoData(example_sceset, aes(x = total_features,
                                  y = pct_exprs_feature_controls,
                                  colour = Mutation_Status)) +
    theme(legend.position = "top") +
    stat_smooth(method = "lm", se = FALSE, size = 2, fullrange = TRUE)

## ----plot-pca-default----------------------------------------------------
plotPCA(example_sceset)

## ----plot-pca-cpm, fig.show=FALSE----------------------------------------
plotPCA(example_sceset, exprs_values = "cpm")

## ----plot-pca-feature-controls, fig.show = FALSE-------------------------
plotPCA(example_sceset, feature_set = fData(example_sceset)$is_feature_control)

## ----plot-pca-4comp-colby-shapeby, fig.height=5.5------------------------
plotPCA(example_sceset, ncomponents = 4, colour_by = "Treatment",
        shape_by = "Mutation_Status")

## ----plot-pca-4comp-colby-sizeby-exprs, fig.height=5.5-------------------
plotPCA(example_sceset, colour_by = "Gene_0001", size_by = "Gene_1000")

## ----plot-tsne-1comp-colby-sizeby-exprs, fig.height=5.5------------------
plotTSNE(example_sceset, colour_by = "Gene_0001", size_by = "Gene_1000")

## ----plot-difmap-1comp-colby-sizeby-exprs, fig.height=5.5----------------
plotDiffusionMap(example_sceset, colour_by = "Gene_0001", size_by = "Gene_1000")

## ----plot-pca-4comp-colby-shapeby-save-pcs, fig.show = FALSE-------------
example_sceset <- plotPCA(example_sceset, ncomponents = 4,
                          colour_by = "Treatment", shape_by = "Mutation_Status",
                          return_SCESet = TRUE, theme_size = 12)
head(reducedDimension(example_sceset))

## ----plot-reduceddim-4comp-colby-shapeby, fig.show=FALSE-----------------
plotReducedDim(example_sceset, ncomponents = 4, colour_by = "Treatment",
               shape_by = "Mutation_Status")

## ----plot-reduceddim-4comp-colby-sizeby-exprs, fig.show = FALSE----------
plotReducedDim(example_sceset, ncomponents = 4, colour_by = "Gene_1000",
               size_by = "Gene_0500")

## ----plot-pca-outlier----------------------------------------------------
example_sceset <- plotPCA(example_sceset, pca_data_input = "pdata", 
                          detect_outliers = TRUE, return_SCESet = TRUE)


## ----plot-qc-expl-variables-all, warning=FALSE---------------------------
plotQC(example_sceset, type = "expl")

## ----plot-qc-expl-variables-select-variables-----------------------------
plotQC(example_sceset, type = "expl",
       variables = c("total_features", "total_counts", "Mutation_Status", "Treatment",
                     "Cell_Cycle"))

## ----plot-qc-pairs-pc----------------------------------------------------
plotQC(example_sceset, type = "expl", method = "pairs", theme_size = 6)

## ----plot-qc-find-pcs-pcs-vs-vars, fig.width=8, fig.height=7-------------
p1 <- plotQC(example_sceset, type = "find-pcs", variable = "total_features",
        plot_type = "pcs-vs-vars")
p2 <- plotQC(example_sceset, type = "find-pcs", variable = "Cell_Cycle",
       plot_type = "pcs-vs-vars")
multiplot(p1, p2, cols = 2)

## ----plot-qc-find-pcs-pairs, fig.width=10, fig.height=7------------------
plotQC(example_sceset, type = "find-pcs", variable = "total_features",
       plot_type = "pairs-pcs")

## ----plot-qc-find-pcs-pairs-2, fig.show=FALSE----------------------------
plotQC(example_sceset, type = "find-pcs", variable = "Cell_Cycle",
       plot_type = "pairs-pcs")

## ----cell-pairwise-distance-matrices-euclidean, eval=TRUE----------------
cell_dist <- dist(t(exprs(example_sceset)))
cellPairwiseDistances(example_sceset) <- cell_dist
plotMDS(example_sceset)

## ----cell-pairwise-distance-matrices-canberra, eval=TRUE, fig.show = FALSE----
cell_dist <- dist(t(counts(example_sceset)), method = "canberra")
cellPairwiseDistances(example_sceset) <- cell_dist
plotMDS(example_sceset, colour_by = "Mutation_Status")

## ----feature-pairwise-distance-matrices, eval=FALSE----------------------
#  feature_dist <- dist(exprs(example_sceset))
#  featurePairwiseDistances(example_sceset) <- feature_dist
#  limma::plotMDS(featDist(example_sceset))

## ----kallisto-demo-kallisto-test-data, eval=FALSE------------------------
#  ################################################################################
#  ### Tests and Examples
#  
#  # Example if in the kallisto/test directory
#  setwd("/home/davis/kallisto/test")
#  kallisto_log <- runKallisto("targets.txt", "transcripts.idx", single_end=FALSE,
#              output_prefix="output", verbose=TRUE, n_bootstrap_samples=10)
#  
#  sce_test <- readKallistoResults(kallisto_log, read_h5=TRUE)
#  sce_test

## ----kallisto-cell-cycle-example, eval=FALSE-----------------------------
#  setwd("/home/davis/021_Cell_Cycle/data/fastq")
#  system("wc -l targets.txt")
#  ave_frag_len <- mean(c(855, 860, 810, 760, 600, 690, 770, 690))
#  
#  kallisto_test <- runKallisto("targets.txt",
#                               "Mus_musculus.GRCm38.rel79.cdna.all.ERCC.idx",
#                               output_prefix="kallisto_output_Mmus", n_cores=12,
#                               fragment_length=ave_frag_len, verbose=TRUE)
#  sce_kall_mmus <- readKallistoResults(kallisto_test, read_h5=TRUE)
#  sce_kall_mmus
#  
#  sce_kall_mmus <- readKallistoResults(kallisto_test)
#  
#  sce_kall_mmus <- getBMFeatureAnnos(sce_kall_mmus)
#  
#  head(fData(sce_kall_mmus))
#  head(pData(sce_kall_mmus))
#  sce_kall_mmus[["start_time"]]
#  
#  counts(sce_kall_mmus)[sample(nrow(sce_kall_mmus), size=15), 1:6]
#  
#  ## Summarise expression at the gene level
#  sce_kall_mmus_gene <- summariseExprsAcrossFeatures(
#      sce_kall_mmus, exprs_values="tpm", summarise_by="feature_id")
#  
#  datatable(fData(sce_kall_mmus_gene))
#  
#  sce_kall_mmus_gene <- getBMFeatureAnnos(
#      sce_kall_mmus_gene, filters="ensembl_gene_id",
#      attributes=c("ensembl_gene_id", "mgi_symbol", "chromosome_name",
#                   "gene_biotype", "start_position", "end_position",
#                   "percentage_gc_content", "description"),
#      feature_symbol="mgi_symbol", feature_id="ensembl_gene_id",
#      biomart="ensembl", dataset="mmusculus_gene_ensembl")
#  
#  datatable(fData(sce_kall_mmus_gene))
#  
#  ## Add gene symbols to featureNames to make them more intuitive
#  new_feature_names <- featureNames(sce_kall_mmus_gene)
#  notna_mgi_symb <- !is.na(fData(sce_kall_mmus_gene)$mgi_symbol)
#  new_feature_names[notna_mgi_symb] <- fData(sce_kall_mmus_gene)$mgi_symbol[notna_mgi_symb]
#  notna_ens_gid <- !is.na(fData(sce_kall_mmus_gene)$ensembl_gene_id)
#  new_feature_names[notna_ens_gid] <- paste(new_feature_names[notna_ens_gid],
#            fData(sce_kall_mmus_gene)$ensembl_gene_id[notna_ens_gid], sep="_")
#  sum(duplicated(new_feature_names))
#  featureNames(sce_kall_mmus_gene) <- new_feature_names
#  head(featureNames(sce_kall_mmus_gene))
#  tail(featureNames(sce_kall_mmus_gene))
#  sum(duplicated(fData(sce_kall_mmus_gene)$mgi_symbol))
#  

