ABPkgBuilder           package:AnnBuilder           R Documentation

_F_u_n_c_t_i_o_n_s _t_h_a_t _s_u_p_p_o_r_t _a _s_i_n_g_l_e _A_P_I _f_o_r _b_u_i_l_d_i_n_g _d_a_t_a _p_a_c_k_a_g_e_s

_D_e_s_c_r_i_p_t_i_o_n:

     These functions support a single API represented by ABPkgBuilder
     to allow users to build annotation data packages by providing a
     limited number of parameters. Other parameters will be figured out
     by the supporting functions.

_U_s_a_g_e:

     ABPkgBuilder(baseName, srcUrls, baseMapType = c("gb", "ug", "ll", "image",
     "refseq", "gbNRef"), otherSrc = NULL, pkgName, pkgPath, organism,
     version, author, fromWeb = TRUE, lazyLoad = TRUE)
     getBaseParsers(baseMapType = c("gb", "ug", "image", "ll", "refseq",
     "gbNRef", "ll2gb", "gb2ll", "eggo", "eginfo", "egrefseq", "egpubmed",
     "egunigene", "egmim"))
     createEmptyDPkg(pkgName, pkgPath, folders, force = TRUE)
     getDirContent(dirName, exclude = NULL)
     getMultiColNames()
     getUniColNames()
     getTypeColNames()
     splitEntry(dataRow, sep = ";", asNumeric = FALSE)
     twoStepSplit(dataRow, entrySep = ";", eleSep = "@", asNumeric = FALSE)
     saveMat(data, pkgName, pkgPath, envName, keyCol = 1,
                              valCol = 2, fun = function(x) x)
     saveList(dList, pkgName, pkgPath, envName)
     nameGOByCat(GOWithEvi, goCat)
     getChrLengths(organism)
     getHumanChrLengths()
     getMouseChrLengths()
     getRatChrLengths()
     getYeastChrLengths()
     getList4GO(goNCat, goNEvi)
     vect2List(vector, vectNames)
     resumeSrcUrl(srcObjs, organism)
     writeDatalist(pkgName, pkgPath)
     getEGAccName()

_A_r_g_u_m_e_n_t_s:

baseName: 'baseName' a character string for the name of a file to be
          used as a base file to base source data. The file is assumed
          to have two columns (separated by tabs "\t") with the first
          one being the names of genes (probes) to be annotated and the
          second one being the maps to GenBank accession numbers,
          UniGene ids, image clone ids or LocusLink ids

 srcUrls: 'srcUrls' a vector of named character strings for the urls
          where source data files will be retrieved. Valid sources are
          LocusLink, UniGene, Golden Path, Gene Ontology, and KEGG. The
          names for the character strings should be LL, UG, GP, GO, and
          KEGG, respectively. LL and UG are required

baseMapType: 'baseMapType' a character string that is either "gb", "ug",
          "image", "ll", "refseq", or "gbNRef" to indicate whether the
          probe ids in baseName are mapped to GenBank accession
          numbers, UniGene ids, image clone ids, LocusLink ids, RefSeq
          ids, or a mixture of GenBank accession numbers and RefSeq
          ids

otherSrc: 'otherSrc' a vector of named character strings for the names
          of files that contain mappings between probe ids of baseName
          and LocusLink ids that will be used to obtain the unified
          mappings between probe ids of baseName and LocusLink ids
          based on all the sources. The strings should not contain any
          number and the files have the same structure as baseName

 pkgName: 'pkgName' a character string for the name of the data package
          to be built (e. g. hgu95a, rgu34a)

 pkgPath: 'pkgPath' a character string for the full path of an existing
          directory where the built package will be stored

organism: 'organism' a character string for the name of the organism of
          concern (now can only be "human", "mouse", or "rat")

 version: 'version' a character string for the version number

  author: 'author' a list of character strings with an author element
          for the name of the author and maintainer element for the
          email address of the author

   force: 'force' a boolean that is set to TRUE if the package to be
          created will replace an existing package with the same name

 dirName: 'dirName' a character string for the name of a directory
          whose contents are of interest

 exclude: 'exclude' a character string for a pattern matching parameter
          that will be used to exclude contents of a directory that
          match the pattern

 dataRow: 'dataRow' a character string containing data elements with
          elements separated by 'sep' or 'entrySep' and a descriptive
          string attached to each element following 'eleSep'

     sep: 'sep' a character string for a separator

entrySep: 'entrySep' a character string for a separator

  eleSep: 'eleSep' a character string for a separator

asNumeric: 'asNumeric' a boolean that is TRUE when the split values
          will be returned as numeric values

 fromWeb: 'fromWeb' a boolean to indicate whether the source data will
          be downloaded from the web or read from a local file

 folders: 'folders' a vector of character strings for the names of
          folders to be created within a package that is going to be
          created

    data: 'data' a data matrix to be written as an environment object

   dList: 'dList' a list to be written as an environment object

 envName: 'envName' a character string for the name of an environment
          object to be written as keys in an environment

  keyCol: 'keyCol' a numeric number indicating the column of a matrix
          that contains keys

  valCol: 'valCol' a numeric number indicating the column of a matrix
          that contains data that will be written as values in an
          environment

     fun: 'fun' an R function that will be passed as an argument

GOWithEvi: 'GOWithEvi' a vector of character strings in the format of
          "GO:xxxx@TS;GO:xxxxx@P;..." where letters following "@" are
          evidence code

   goCat: 'goCat' a matrix with the first column being GO ids and the
          second column being GO categories

  goNCat: 'goNCat' a named vector with GO category as the values and GO
          id as the names

  goNEvi: 'goNEvi' a list of named vectors with GO ids as values for
          vectors and evidence code as names for vector values

  vector: 'vector' a vector that is going to be converted to a list
          using 'as.list'

vectNames: 'vectNames' a vector of character strings for the names of
          'vector' that is going to be converted to a list

 srcObjs: 'srcObjs' a list that contains objects of the pubRepo class

lazyLoad: 'lazyLoad' a boolean indicating whether a lazy load database
          will be created

_D_e_t_a_i_l_s:

     These functions are the results of an effort to make data package
     building easier for users. As a result, users may not have great
     power controlling the process or inputs. Additionally, some of the
     built in functions that figure out the urls for source data may
     fail when maintainers of the data source web sites change the
     name, structure, etc. of the source data. When such an event
     occurs, users may have to follow the instructions contained in a
     vignette named AnnBuilder to build data packages.

     'getBaseParsers' figures out which of the built in parsers to use
     to parse the source data based on the type of the mappings done
     for the probes.

     'createEmptyDPkg' creates an empty package with the required
     subdirectories for data to be stored.

     'getMultiColNames' figures out what data elements for annotation
     have many to one relations with a probe. The many parts are
     separated by a separator in parsed annotation data.

     'getUniColNames' figures out what data elements for annotation
     have one to one relations with a probe.

     'getTypeColNames' figures out what data elements for annotation
     have many to one relations with a probe and additional information
     appended to the end of each element following a separator. The
     many parts are also separated by a separator in parsed annotation
     data.

     'splitEntry' splits entries by a separator.

     'twoStepSplit' splits entries by the separator specified by
     'entrySep' and the descriptive information of each element by
     'eleSep'.

_V_a_l_u_e:

     'getBaseParsers' returns a named vector for the names of the
     parsers to use to parse the source data.

     'getDirContent' returns a vector of character strings for the
     content of a directory of interest.

     'getMultiColNames' returns a vector of character strings.

     'getUniColNames' returns a vector of character strings.

     'getTypeColNames' returns a vector of character strings.

     'splitEntry' returns a vector of character strings.

     'twoStepSplit' returns a named vector of character strings. The
     names are the descriptive information appended to each element by
     'eleSep'

_N_o_t_e:

     The functions are part of the Bioconductor project at Dana-Farber
     Cancer Institute to provide Bioinformatics functionalities through
     R

_A_u_t_h_o_r(_s):

     Jianhua Zhang

_R_e_f_e_r_e_n_c_e_s:

     ABPrimer and AnnBuilder vignettes

_S_e_e _A_l_s_o:

     'GOPkgBuilder','KEGGPkgBuilder'

_E_x_a_m_p_l_e_s:

     # Create a temporary directory for the data
     myDir <- tempdir()
     # Create a temp base data file: each row pairs a probe id with the
     # GenBank accession number it maps to (baseMapType = "gb" below)
     geneNMap <- matrix(c("32468_f_at", "D90278", "32469_at", "L00693",
                        "32481_at", "AL031663", "33825_at", "X68733",
                        "35730_at", "X03350", "36512_at", "L32179",
                        "38912_at", "D90042", "38936_at", "M16652",
                        "39368_at", "AL031668"), ncol = 2, byrow = TRUE)
     # Two tab-separated columns, no header and no quoting, as described
     # for the baseName argument
     write.table(geneNMap, file = file.path(myDir, "geneNMap"),
                 sep = "\t", quote = FALSE, row.names = FALSE,
                 col.names = FALSE)
     # Urls for truncated versions of source data
     mySrcUrls <- c(
       LL = "http://www.bioconductor.org/datafiles/wwwsources/Tll_tmpl.gz",
       UG = "http://www.bioconductor.org/datafiles/wwwsources/Ths.data.gz",
       GO = "http://www.bioconductor.org/datafiles/wwwsources/Tgo.xml")
     # Create temp files for other sources: same two-column layout as the
     # base file, with the second column holding an id (or NA when the
     # source has no mapping for that probe)
     temp <- matrix(c("32468_f_at", NA, "32469_at", "2",
                        "32481_at", NA, "33825_at", "9",
                        "35730_at", "1576", "36512_at", NA,
                        "38912_at", "10", "38936_at", NA,
                        "39368_at", NA), ncol = 2, byrow = TRUE)
     write.table(temp, file = file.path(myDir, "srcone"), sep = "\t",
                 quote = FALSE, row.names = FALSE, col.names = FALSE)
     temp <- matrix(c("32468_f_at", NA, "32469_at", NA,
                        "32481_at", "7051", "33825_at", NA,
                        "35730_at", NA, "36512_at", "1084",
                        "38912_at", NA, "38936_at", NA,
                        "39368_at", "89"), ncol = 2, byrow = TRUE)
     write.table(temp, file = file.path(myDir, "srctwo"), sep = "\t",
                 quote = FALSE, row.names = FALSE, col.names = FALSE)
     # Named vector of the extra mapping files, passed as otherSrc
     otherMapping <- c(srcone = file.path(myDir, "srcone"),
                       srctwo = file.path(myDir, "srctwo"))
     # Runs only upon user's request
     if (interactive()) {
       # Only arguments listed in the Usage section are passed
       ABPkgBuilder(baseName = file.path(myDir, "geneNMap"),
                    srcUrls = mySrcUrls, baseMapType = "gb",
                    otherSrc = otherMapping, pkgName = "myPkg",
                    pkgPath = myDir, organism = "Homo sapiens",
                    version = "1.1.0",
                    author = c(author = "myname",
                               maintainer = "myname@myemail.com"))
       # Output files
       list.files(myDir)
       # Content of the data package
       list.files(file.path(myDir, "myPkg"))
       list.files(file.path(myDir, "myPkg", "data"))
       list.files(file.path(myDir, "myPkg", "man"))
       list.files(file.path(myDir, "myPkg", "R"))
       # Remove the built package directory and everything inside it
       unlink(file.path(myDir, "myPkg"), recursive = TRUE)
       # Remove XML side files in case the build produced them; unlink()
       # does not complain about paths that do not exist
       unlink(file.path(myDir, "myPkg.xml"))
       unlink(file.path(myDir, "myPkgByNum.xml"))
     }
     # Always clean up the temporary input files created above
     unlink(c(file.path(myDir, "geneNMap"), file.path(myDir, "srcone"),
              file.path(myDir, "srctwo")))

