## globally defined constants
MAXCHARVARID <- 32
MAXCHARCHROMNAMES <- 32
MAXCHARALLELE <- 20

freezeKnownVariantsESP <- function(sourceURL=MafDbESPdefaultURL, targetdir=getwd(), MafDbFilename=MafDbFilename, genome="hg19") {
  tarballFilename <- tempfile(fileext=".vcf.tar.gz")
  vcfFilenames <- c()

  message(sprintf("Downloading known variants from the NHLBI Exome Sequencing Project (ESP) through the VCF file located at:\n%s", sourceURL))
  err <- download.file(url=sourceURL, destfile=tarballFilename, method="curl")
  if (err)
    stop(sprintf("could not download file from %s. Please check that your internet connection is working, that this URL is correct and that there is sufficient disk space in %s to store the file.", sourceURL, tempdir()))
  else {
    vcfFilenames <- untar(tarballFilename, list=TRUE)
    if (length(vcfFilenames) == 0)
      stop(sprintf("Tarball file at %s contains no files.", sourceURL))

    err <- untar(tarballFilename, exdir=dirname(tarballFilename))
    if (err)
      stop(sprintf("could not extract files from the downloaded file at %s. Please check that this URL points to a correct tarball file and that there is sufficient disk space in %s to store all contained files.", sourceURL, tempdir()))

    ## check that the tarball contains VCF files
    for (f in vcfFilenames) {
      tryCatch({
        vcfHeader <- scanVcfHeader(file.path(dirname(tarballFilename), f))
      }, error=function(err) {
        stop(sprintf("extracted VCF file %s has not a valid VCF header.", f), call.=TRUE)
      })
    }
  }

  param <- ScanVcfParam(geno=NA,
                        fixed="ALT",
                        info="MAF")

  conn <- dbConnect(SQLite(), dbname=file.path(targetdir, MafDbFilename))
  sql <- c("CREATE TABLE knownVariants (\n",
           sprintf("  varID VARCHAR(%d) NOT NULL,\n", MAXCHARVARID),
           sprintf("  chrom VARCHAR(%d) NOT NULL,\n", MAXCHARCHROMNAMES),
           "  start INTEGER NOT NULL,\n",
           sprintf("  ref VARCHAR(%d) NOT NULL,\n", MAXCHARALLELE),
           "  wref INTEGER NOT NULL,\n",
           sprintf("  alt VARCHAR(%d) NOT NULL,\n", MAXCHARALLELE),
           "  walt INTEGER NOT NULL,\n",
           "  AF CHAR(1),\n",
           "  EA_AF CHAR(1),\n",
           "  AA_AF CHAR(1)\n",
           ")")
  VariantFiltering:::dbEasyQuery(conn, paste(sql, collapse=""))

  message("Starting to process known variants")
  nVar <- 0
  i <- 1
  while (i <= length(vcfFilenames)) {
    f <- vcfFilenames[i]
    vcf <- readVcf(file.path(dirname(tarballFilename), f), genome=genome, param=param)
    rd <- rowData(vcf)
    ## at ESP multiple ALT alleles are coded within the same variant. normalize this
    ## information by duplicating rows
    elen <- elementLengths(rd$ALT)
    knownVariants <- data.frame(varID=rep(rownames(vcf), times=elen),
                                chrom=rep(as.character(seqnames(rd)), times=elen),
                                start=rep(start(rd), times=elen),
                                ref=rep(substring(as.character(rd$REF), 1, MAXCHARALLELE), times=elen),
                                wref=rep(width(rd$REF), times=elen),
                                alt=substring(as.character(unlist(rd$ALT)), 1, MAXCHARALLELE),
                                walt=width(as.character(unlist(rd$ALT))),
                                check.names=FALSE, stringsAsFactors=FALSE)
    ## according to http://evs.gs.washington.edu/EVS/HelpDescriptions.jsp MAF values are % in [0, 100]
    ## we normalize them to [0, 1] to be consistent with those from 1000 Genomes Project
    mafValues <- matrix(as.numeric(unlist(info(vcf)$MAF)), byrow=TRUE, ncol=3) / 100
    mafValues <- data.frame(AF=rep(rawToChar(codeAF2RAW(mafValues[, 3]), multiple=TRUE), times=elen),
                            EA_AF=rep(rawToChar(codeAF2RAW(mafValues[, 1]), multiple=TRUE), times=elen),
                            AA_AF=rep(rawToChar(codeAF2RAW(mafValues[, 2]), multiple=TRUE), times=elen))
    knownVariants <- cbind(knownVariants, mafValues, stringsAsFactors=FALSE)                      
    sql <- sprintf("INSERT INTO knownVariants VALUES (%s)", paste(rep("?", ncol(knownVariants)), collapse=","))
    VariantFiltering:::dbEasyPreparedQuery(conn, paste(sql, collapse=""), knownVariants)
    i <- i + 1
    nVar <- nVar + nrow(knownVariants)
    message(sprintf("%d known variants from the NHLBI Exome Sequencing Project (ESP) processed", nVar))
  }

  thispkg_version <- installed.packages()['VariantFiltering', 'Version']
  rsqlite_version <- installed.packages()['RSQLite', 'Version']

  metadata <- data.frame(name=c("Db type", "Supporting package", "Data source", "Data source tag",
                                "Resource URL", "Number of variants", "Db created by", "Creation date",
                                "VariantFiltering version at creation time", "RSQLite version at creation time"),
                         value=c("MafDb", "VariantFiltering", "NHLBI Exome Sequencing Project", "ESP", sourceURL,
                                 sprintf("%d", nVar), "VariantFiltering package", date(), thispkg_version,
                                 rsqlite_version))

  sql <- c("CREATE TABLE metadata (\n",
           "  name TEXT NOT NULL,\n",
           "  value TEXT NOT NULL)\n")
  VariantFiltering:::dbEasyQuery(conn, paste(sql, collapse=""))
  sql <- sprintf("INSERT INTO metadata VALUES (%s)", paste(rep("?", ncol(metadata)), collapse=","))
  VariantFiltering:::dbEasyPreparedQuery(conn, paste(sql, collapse=""), metadata)

  dbDisconnect(conn)

  file.path(targetdir, MafDbFilename)
}

## adapted from makeTxDbPackage() in GenomicFeatures/R/makeTxDbPackage.R
makeMafDbPackageESP <- function(destDir=path.expand("~"), MafDbURL=MafDbESPdefaultURL,
                                MafDbPkgName=MafDbESPdefaultPkgName, genome="hg19",
                                version=NULL, author=NULL, maintainer=NULL, license=NULL,
                                yieldSize=1000000) {

  thisPkg <- getAnywhere("makeMafDbPackageESP")
  thisPkgName <- sub("package:", "", thisPkg$where[grep("package", thisPkg$where)[1]])

  if (is.null(version)) { ## by default custom versions of MafDb.* have x.y.z version numbers with y being odd
    version <- packageDescription(thisPkgName)$Version
    x <- as.integer(gsub(".[0-9]+.[0-9]+$", "", version))
    y <- as.integer(gsub("[0-9]+$", "", substring(version, gregexpr("[0-9]+.[0-9]+$", version)[[1]])))
    z <- as.integer(substring(version, gregexpr("[0-9]+$", version)[[1]]))
    if (y %% 2 == 0) {
      y <- y + 1
      z <- 0
    } else
      z <- z + 1

    version <- paste(x, y, z, sep=".")
  }

  if (is.null(author))
    author <- packageDescription(thisPkgName)$Author

  if (is.null(maintainer))
    maintainer <- maintainer(thisPkgName)

  if (is.null(license))
    license <- packageDescription(thisPkgName)$License

  symvals <- list(PKGTITLE="Minimum allele frequency data from NHLBI ESP",
                  PKGDESCRIPTION="Minimum allele frequency data frozen from the NHLBI Exome Sequencing Project",
                  PKGVERSION=version,
                  AUTHOR=author,
                  MAINTAINER=maintainer,
                  LIC=license)

  res <- createPackage(pkgname=MafDbPkgName,
                       destinationDir=destDir,
                       originDir=system.file("MafDbPkgTemplate", package=thisPkgName),
                       symbolValues=symvals)

  suc <- file.copy(from=system.file("MafDbPkgTemplate", package=thisPkgName),
                   to=file.path(destDir, MafDbPkgName, "inst"), recursive=TRUE)
  if (!suc)
    stop(sprintf("Cannot write in the %s directory.", file.path(destDir, MafDbPkgName, "inst")))

  frozenDataDir <- file.path(destDir, MafDbPkgName, "inst", "extdata")

  dbnameESP <- freezeKnownVariantsESP(sourceURL=MafDbURL, targetdir=frozenDataDir,
                                      MafDbFilename=paste0(MafDbPkgName, ".sqlite"),
                                      genome=genome)

  message(sprintf("A new MafDb package has been created in directory %s.", file.path(destDir, MafDbPkgName)))
  message(sprintf("Please run the command lines:\n\n   R CMD build --keep-empty-dirs %s", file.path(destDir, MafDbPkgName)))
  message(sprintf("   R CMD INSTALL %s_%s.tar.gz", MafDbPkgName, version))
  message("\nto install the package on R.")

  invisible(res$pkgdir)
}
