matchPDict            package:Biostrings            R Documentation

_S_e_a_r_c_h_i_n_g _a _s_e_q_u_e_n_c_e _f_o_r _p_a_t_t_e_r_n_s _s_t_o_r_e_d _i_n _a _d_i_c_t_i_o_n_a_r_y

_D_e_s_c_r_i_p_t_i_o_n:

     'matchPDict' efficiently finds all occurences in a text (the
     subject) of any pattern from a set of patterns (the dictionary).

     The implementation of 'matchPDict' is based on the Aho-Corasick
     algorithm and is very fast.

     Note that, for now, 'matchPDict' only works with a dictionary of
     DNA patterns.

     This man page shows how to use 'matchPDict' for exact matching of
     a constant width dictionary i.e. a dictionary where all the
     patterns have the same length (same number of nucleotides).

     See matchPDict-inexact for inexact matching (or for using a
     variable width dictionary).

_U_s_a_g_e:

       matchPDict(pdict, subject, algorithm="auto", max.mismatch=0, fixed=TRUE)
       countPDict(pdict, subject, algorithm="auto", max.mismatch=0, fixed=TRUE)

       ## Manipulation of the MIndex object returned by matchPDict()
       startIndex(x, all.names=FALSE)
       endIndex(x, all.names=FALSE)
       countIndex(x, all.names=FALSE)
       unlist(x, recursive=TRUE, use.names=TRUE)
       extractAllMatches(subject, mindex)

_A_r_g_u_m_e_n_t_s:

   pdict: The preprocessed dictionary (see below for the details). 

 subject: An XString object containing the subject string. For now,
          only XString subjects of subtype DNAString are supported. 

algorithm: Not supported yet. 

max.mismatch: The maximum number of mismatching letters allowed (see
          'isMatching' for the details). When used on a constant width
          dictionary, 'matchPDict' (and 'countPDict') only support
          exact matching so 'max.mismatch' must be zero. See
          matchPDict-inexact for inexact matching. 

   fixed: If 'FALSE' then IUPAC extended letters are interpreted as
          ambiguities (see 'isMatching' for the details). When used on
          a constant width dictionary, 'matchPDict' (and 'countPDict')
          only support exact matching so 'fixed' must be 'TRUE'. See
          matchPDict-inexact for inexact matching. 

       x: A PDict or MIndex object for 'names'. An MIndex object for
          the other methods. 

all.names: 'TRUE' or 'FALSE'. 

recursive: ignored. 

use.names: ignored. 

  mindex: An MIndex object returned by a previous call to 'matchPDict'. 

_V_a_l_u_e:

     'matchPDict' returns an MIndex object.

     'countPDict' returns an integer vector.

     'startIndex', 'endIndex' and 'countIndex' return vectors of the
     same length as the input dictionary: 'startIndex' and 'endIndex'
     return a list of integer vectors, and 'countIndex' returns an
     integer vector.

     'extractAllMatches' returns an XStringViews object with names.

_A_u_t_h_o_r(_s):

     H. Pages

_S_e_e _A_l_s_o:

     PDict, matchPDict-inexact, 'isMatching', 'coverage',
     'matchPattern', 'alphabetFrequency', XStringViews-class,
     DNAString-class

_E_x_a_m_p_l_e_s:

       ## ---------------------------------------------------------------------
       ## A. WITH UNNAMED PATTERNS
       ## ---------------------------------------------------------------------

       ## Creating the pattern dictionary
       library(drosophila2probe)
       dict0 <- drosophila2probe$sequence   # The input dictionary.
       length(dict0)                        # Hundreds of thousands of patterns.
       pdict0 <- PDict(dict0)               # Store the input dictionary into a
                                            # PDict object (preprocessing).

       ## Using the pattern dictionary on chromosome 3R
       library(BSgenome.Dmelanogaster.UCSC.dm3)
       chr3R <- Dmelanogaster$chr3R         # Load chromosome 3R
       chr3R
       mindex <- matchPDict(pdict0, chr3R)  # Search...

       ## Looking at the matches
       start_index <- startIndex(mindex)    # Get the start index.
       length(start_index)                  # Same as the input dictionary.
       start_index[[8220]]                  # Starts of the 8220th pattern.
       end_index <- endIndex(mindex)        # Get the end index.
       end_index[[8220]]                    # Ends of the 8220th pattern.
       count_index <- countIndex(mindex)    # Get the number of matches per pattern.
       count_index[[8220]]
       mindex[[8220]]                       # Get the matches for the 8220th pattern.
       start(mindex[[8220]])                # Equivalent to startIndex(mindex)[[8220]].
       sum(count_index)                     # Total number of matches.
       table(count_index)
       i0 <- which(count_index == max(count_index))
       dict0[i0]                            # The pattern with most occurences.
       mindex[[i0]]                         # Its matches as a Views object.
       views(chr3R, start_index[[i0]], end_index[[i0]]) # And as an XStringViews object.

       ## Get the coverage of the original subject
       cov3R <- coverage(mindex, 1, length(chr3R))
       max(cov3R)
       mean(cov3R)
       sum(cov3R != 0) / length(cov3R)      # Only 2.44
       if (interactive()) {
         plotCoverage <- function(coverage, start, end)
         {
           plot.new()
           plot.window(c(start, end), c(0, 20))
           axis(1)
           axis(2)
           axis(4)
           lines(start:end, coverage[start:end], type="l")
         }
         plotCoverage(cov3R, 27600000, 27900000)
       }

       ## ---------------------------------------------------------------------
       ## B. WITH NAMED PATTERNS
       ## ---------------------------------------------------------------------

       dict1 <- dict0[8211:8236]            # The input dictionary.
       names(dict1) <- LETTERS
       dict1[1:5]
       pdict1 <- PDict(dict1)
       length(pdict1)                       # Same as the input dictionary.
       names(pdict1)                        # Same as names(dict1).
       mindex <- matchPDict(pdict1, chr3R)  # Search...

       ## Looking at the matches
       names(mindex)                        # Same as names(dict1).
       start_index <- startIndex(mindex)
       start_index
       length(start_index)                  # NOT the same as the input dictionary.
       start_index <- startIndex(mindex, all.names=TRUE)
       length(start_index)                  # Same as the input dictionary.
       countIndex(mindex)
       unlist(mindex)
       length(unlist(mindex))               # Total number of matches.
       all_matches <- extractAllMatches(chr3R, mindex)
       all_matches
       names(all_matches)

       ## ---------------------------------------------------------------------
       ## C. PERFORMANCE
       ## ---------------------------------------------------------------------

       ## If getting the number of matches is what matters only (without regarding
       ## their positions), then countPDict() will be faster, especially when there
       ## is a high number of matches
       count_index1 <- countPDict(pdict1, chr3R)
       identical(count_index1, count_index) # TRUE

       if (interactive()) {
         ## What's the impact of the dictionary width on performance?
         ## Below is some code that can be used to figure out (will take a long
         ## time to run). For different widths of the input dictionary, we look at:
         ##   o pptime: preprocessing time (in sec.) i.e. time needed for building
         ##             the PDict object from the truncated input sequences
         ##   o nnodes: nb of nodes in the resulting Aho-Corasick tree
         ##   o nupatt: nb of unique truncated input sequences
         ##   o matchtime: time (in sec.) needed to find all the matches
         ##   o totalcount: total number of matches
         getPDictStats <- function(dict0, width, subject)
         {
           list(
             width=width,
             pptime=system.time(pdict <- PDict(dict0, tb.end=width, drop.tail=TRUE))[["elapsed"]],
             nnodes=length(pdict@actree),
             nupatt=sum(pdict@dups == 0),
             matchtime=system.time(mindex <- matchPDict(pdict, subject))[["elapsed"]],
             totalcount=sum(countIndex(mindex))
           )
         }
         stats <- lapply(6:25, function(width) getPDictStats(dict0, width, chr3R))
         stats <- data.frame(do.call(rbind, stats))
         stats
       }

