## ----style, eval=TRUE, echo=FALSE, results="asis"---------------------------------------
BiocStyle::latex()

## ---------------------------------------------------------------------------------------
# Load the R packages: gdsfmt and SNPRelate
library(gdsfmt)
library(SNPRelate)

## ---------------------------------------------------------------------------------------
snpgdsSummary(snpgdsExampleFileName())

## ---------------------------------------------------------------------------------------
# Open a GDS file
(genofile <- snpgdsOpen(snpgdsExampleFileName()))

## ---------------------------------------------------------------------------------------
# Get the attributes of chromosome coding
get.attr.gdsn(index.gdsn(genofile, "snp.chromosome"))

## ---------------------------------------------------------------------------------------
# Take out genotype data for the first 3 samples and the first 5 SNPs
(g <- read.gdsn(index.gdsn(genofile, "genotype"), start=c(1,1), count=c(5,3)))

## ----eval=FALSE-------------------------------------------------------------------------
#  g <- snpgdsGetGeno(genofile, sample.id=..., snp.id=...)

## ---------------------------------------------------------------------------------------
# Get the attribute of genotype
get.attr.gdsn(index.gdsn(genofile, "genotype"))

## ---------------------------------------------------------------------------------------
# Take out snp.id
head(read.gdsn(index.gdsn(genofile, "snp.id")))
# Take out snp.rs.id
head(read.gdsn(index.gdsn(genofile, "snp.rs.id")))

## ---------------------------------------------------------------------------------------
# Read population information
pop <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"))
table(pop)

# Close the GDS file
snpgdsClose(genofile)

## ---------------------------------------------------------------------------------------
# Load data
data(hapmap_geno)

# Create a gds file
snpgdsCreateGeno("test.gds", genmat = hapmap_geno$genotype,
    sample.id = hapmap_geno$sample.id, snp.id = hapmap_geno$snp.id,
    snp.chromosome = hapmap_geno$snp.chromosome,
    snp.position = hapmap_geno$snp.position,
    snp.allele = hapmap_geno$snp.allele, snpfirstdim=TRUE)

# Open the GDS file
(genofile <- snpgdsOpen("test.gds"))

# Close the GDS file
snpgdsClose(genofile)

## ----eval=FALSE-------------------------------------------------------------------------
#  # Create a new GDS file
#  newfile <- createfn.gds("your_gds_file.gds")
#  
#  # add a flag
#  put.attr.gdsn(newfile$root, "FileFormat", "SNP_ARRAY")
#  
#  # Add variables
#  add.gdsn(newfile, "sample.id", sample.id)
#  add.gdsn(newfile, "snp.id", snp.id)
#  add.gdsn(newfile, "snp.position", snp.position)
#  add.gdsn(newfile, "snp.chromosome", snp.chromosome)
#  add.gdsn(newfile, "snp.allele", c("A/G", "T/C", ...))
#  
#  
#  #####################################################################
#  # Create a snp-by-sample genotype matrix
#  
#  # Add genotypes
#  var.geno <- add.gdsn(newfile, "genotype",
#      valdim=c(length(snp.id), length(sample.id)), storage="bit2")
#  
#  # Indicate the SNP matrix is snp-by-sample
#  put.attr.gdsn(var.geno, "snp.order")
#  
#  # Write SNPs into the file sample by sample
#  for (i in 1:length(sample.id))
#  {
#      g <- ...
#      write.gdsn(var.geno, g, start=c(1,i), count=c(-1,1))
#  }
#  
#  
#  #####################################################################
#  # OR, create a sample-by-snp genotype matrix
#  
#  # Add genotypes
#  var.geno <- add.gdsn(newfile, "genotype",
#      valdim=c(length(sample.id), length(snp.id)), storage="bit2")
#  
#  # Indicate the SNP matrix is sample-by-snp
#  put.attr.gdsn(var.geno, "sample.order")
#  
#  # Write SNPs into the file sample by sample
#  for (i in 1:length(snp.id))
#  {
#      g <- ...
#      write.gdsn(var.geno, g, start=c(1,i), count=c(-1,1))
#  }
#  
#  
#  
#  # Get a description of chromosome codes
#  #   allowing to define a new chromosome code, e.g., snpgdsOption(Z=27)
#  option <- snpgdsOption()
#  var.chr <- index.gdsn(newfile, "snp.chromosome")
#  put.attr.gdsn(var.chr, "autosome.start", option$autosome.start)
#  put.attr.gdsn(var.chr, "autosome.end", option$autosome.end)
#  for (i in 1:length(option$chromosome.code))
#  {
#      put.attr.gdsn(var.chr, names(option$chromosome.code)[i],
#          option$chromosome.code[[i]])
#  }
#  
#  # Add your sample annotation
#  samp.annot <- data.frame(sex = c("male", "male", "female", ...),
#      pop.group = c("CEU", "CEU", "JPT", ...), ...)
#  add.gdsn(newfile, "sample.annot", samp.annot)
#  
#  # Add your SNP annotation
#  snp.annot <- data.frame(pass=c(TRUE, TRUE, FALSE, FALSE, TRUE, ...), ...)
#  add.gdsn(newfile, "snp.annot", snp.annot)
#  
#  
#  # Close the GDS file
#  closefn.gds(newfile)

## ---------------------------------------------------------------------------------------
# The PLINK BED file, using the example in the SNPRelate package
bed.fn <- system.file("extdata", "plinkhapmap.bed", package="SNPRelate")
fam.fn <- system.file("extdata", "plinkhapmap.fam", package="SNPRelate")
bim.fn <- system.file("extdata", "plinkhapmap.bim", package="SNPRelate")

## ----eval=FALSE-------------------------------------------------------------------------
#  bed.fn <- "C:/your_folder/your_plink_file.bed"
#  fam.fn <- "C:/your_folder/your_plink_file.fam"
#  bim.fn <- "C:/your_folder/your_plink_file.bim"

## ---------------------------------------------------------------------------------------
# Convert
snpgdsBED2GDS(bed.fn, fam.fn, bim.fn, "test.gds")

# Summary
snpgdsSummary("test.gds")

## ---------------------------------------------------------------------------------------
# The VCF file, using the example in the SNPRelate package
vcf.fn <- system.file("extdata", "sequence.vcf", package="SNPRelate")

## ----eval=FALSE-------------------------------------------------------------------------
#  vcf.fn <- "C:/your_folder/your_vcf_file.vcf"

## ---------------------------------------------------------------------------------------
# Reformat
snpgdsVCF2GDS(vcf.fn, "test.gds", method="biallelic.only")

# Summary
snpgdsSummary("test.gds")

## ---------------------------------------------------------------------------------------
# Open the GDS file
genofile <- snpgdsOpen(snpgdsExampleFileName())

## ---------------------------------------------------------------------------------------
# Get population information
#   or pop_code <- scan("pop.txt", what=character())
#   if it is stored in a text file "pop.txt"
pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"))

# Display the first six values
head(pop_code)

## ---------------------------------------------------------------------------------------
set.seed(1000)

# Try different LD thresholds for sensitivity analysis
snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)
names(snpset)
head(snpset$chr1)  # snp.id

# Get all selected snp id
snpset.id <- unlist(snpset)

## ---------------------------------------------------------------------------------------
# Run PCA
pca <- snpgdsPCA(genofile)

## ---------------------------------------------------------------------------------------
# variance proportion (%)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
# make a data.frame
tab <- data.frame(sample.id = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)

# Draw
plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")

## ---------------------------------------------------------------------------------------
# Get sample id
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))

# Get population information
#   or pop_code <- scan("pop.txt", what=character())
#   if it is stored in a text file "pop.txt"
pop_code <- read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))

# assume the order of sample IDs is as the same as population codes
head(cbind(sample.id, pop_code))

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
# Make a data.frame
tab <- data.frame(sample.id = pca$sample.id,
    pop = factor(pop_code)[match(pca$sample.id, sample.id)],
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)

# Draw
plot(tab$EV2, tab$EV1, col=as.integer(tab$pop),
    xlab="eigenvector 2", ylab="eigenvector 1")
legend("topleft", legend=levels(tab$pop), pch="o", col=1:nlevels(tab$pop))

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
lbls <- paste("PC", 1:4, "\n", format(pc.percent[1:4], digits=2), "%", sep="")
pairs(pca$eigenvect[,1:4], col=tab$pop, labels=lbls)

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
# Get chromosome index
chr <- read.gdsn(index.gdsn(genofile, "snp.chromosome"))
CORR <- snpgdsPCACorr(pca, genofile, eig.which=1:4)

par( mfrow=c(3,1))
for (i in 1:3)
{
    plot(abs(CORR$snpcorr[i,]), ylim=c(0,1), xlab="SNP Index",
        ylab=paste("PC", i), col=chr, pch="+")
}

## ---------------------------------------------------------------------------------------
# YRI samples
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
YRI.id <- sample.id[pop_code == "YRI"]

## ---------------------------------------------------------------------------------------
# Estimate IBD coefficients
ibd <- snpgdsIBDMoM(genofile, sample.id=YRI.id, snp.id=snpset.id,
    maf=0.05, missing.rate=0.05)

# Make a data.frame
ibd.coeff <- snpgdsIBDSelection(ibd)
head(ibd.coeff)

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
plot(ibd.coeff$k0, ibd.coeff$k1, xlim=c(0,1), ylim=c(0,1),
    xlab="k0", ylab="k1", main="YRI samples (MoM)")
lines(c(0,1), c(1,0), col="red", lty=2)

## ----eval=FALSE-------------------------------------------------------------------------
#  # Estimate IBD coefficients
#  set.seed(1000)
#  snp.id <- sample(snpset.id, 5000)  # random 5000 SNPs
#  ibd <- snpgdsIBDMLE(genofile, sample.id=YRI.id, snp.id=snp.id,
#      maf=0.05, missing.rate=0.05)

## ----eval=FALSE-------------------------------------------------------------------------
#  # Make a data.frame
#  ibd.coeff <- snpgdsIBDSelection(ibd)

## ----eval=FALSE-------------------------------------------------------------------------
#  plot(ibd.coeff$k0, ibd.coeff$k1, xlim=c(0,1), ylim=c(0,1),
#      xlab="k0", ylab="k1", main="YRI samples (MLE)")
#  lines(c(0,1), c(1,0), col="red", lty=2)

## ---------------------------------------------------------------------------------------
# Incorporate with pedigree information
family.id <- read.gdsn(index.gdsn(genofile, "sample.annot/family.id"))
family.id <- family.id[match(YRI.id, sample.id)]
table(family.id)

ibd.robust <- snpgdsIBDKING(genofile, sample.id=YRI.id, family.id=family.id)
names(ibd.robust)

# Pairs of individuals
dat <- snpgdsIBDSelection(ibd.robust)
head(dat)

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
plot(dat$IBS0, dat$kinship, xlab="Proportion of Zero IBS",
    ylab="Estimated Kinship Coefficient (KING-robust)")

## ---------------------------------------------------------------------------------------
ibs <- snpgdsIBS(genofile, num.thread=2)

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
library(lattice)

L <- order(pop_code)
levelplot(ibs$ibs[L, L], col.regions = terrain.colors)

## ---------------------------------------------------------------------------------------
loc <- cmdscale(1 - ibs$ibs, k = 2)
x <- loc[, 1]; y <- loc[, 2]
race <- as.factor(pop_code)

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
plot(x, y, col=race, xlab = "", ylab = "",
    main = "Multidimensional Scaling Analysis (IBS Distance)")
legend("topleft", legend=levels(race), text.col=1:nlevels(race))

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
set.seed(100)
ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, num.thread=2))

# Determine groups of individuals automatically
rv <- snpgdsCutTree(ibs.hc)
plot(rv$dendrogram, leaflab="none", main="HapMap Phase II")
table(rv$samp.group)

## ---------------------------------------------------------------------------------------
# Determine groups of individuals by population information
rv2 <- snpgdsCutTree(ibs.hc, samp.group=as.factor(pop_code))

## ----fig.width=5, fig.height=5, fig.align='center'--------------------------------------
plot(rv2$dendrogram, leaflab="none", main="HapMap Phase II")
legend("topright", legend=levels(race), col=1:nlevels(race), pch=19, ncol=4)

## ---------------------------------------------------------------------------------------
# Close the GDS file
snpgdsClose(genofile)

## ----sessioninfo, results="asis"--------------------------------------------------------
toLatex(sessionInfo())

