readCdfUnitsWriteMap       package:affxparser       R Documentation

_G_e_n_e_r_a_t_e_s _a_n _A_f_f_y_m_e_t_r_i_x _c_e_l_l-_i_n_d_e_x _w_r_i_t_e _m_a_p _f_r_o_m _a _C_D_F _f_i_l_e

_D_e_s_c_r_i_p_t_i_o_n:

     Generates an Affymetrix cell-index write map from a CDF file.

     The purpose of this method is to provide a re-ordering of cell
     elements such that cells in units (probesets) can be stored in
     contiguous blocks. When reading cell elements unit by unit, minimal
     file re-positioning is required, resulting in faster reading.

     Note: At the moment this package does not provide methods to
     write/reorder CEL files.  In the meantime, you have to write and
     re-read using your own file format.  That's not too hard using
     'writeBin()' and 'readBin()'.

_U_s_a_g_e:

     readCdfUnitsWriteMap(filename, units=NULL, ..., verbose=FALSE)

_A_r_g_u_m_e_n_t_s:

filename: The pathname of the CDF file.

   units: An 'integer' 'vector' of unit indices specifying which units
          to list first.  All other units are added in order at the
          end. If 'NULL', units are in order.

     ...: Additional arguments passed to 'readCdfUnits'().

 verbose: Either a 'logical', a 'numeric', or a 'Verbose' object
          specifying how much verbose/debug information is written to
          standard output. If a Verbose object, the level of detail
          is specified by the threshold level of the object. If a
          numeric, the value is used to set the threshold
          of a new Verbose object. If 'TRUE', the threshold is set to
          -1 (minimal). If 'FALSE', no output is written (and neither
          is the 'R.utils' package required).

_V_a_l_u_e:

     An 'integer' 'vector' which is a _write_ map.

_A_u_t_h_o_r(_s):

     Henrik Bengtsson (<URL: http://www.braju.com/R/>)

_S_e_e _A_l_s_o:

     To invert maps, see 'invertMap'(). See also 'readCel'() and
     'readCelUnits'().

_E_x_a_m_p_l_e_s:

       
     # Example 1: create a write map for any available CDF file, invert it,
     # and summarize how far apart consecutive entries of the read map are.
     # The single-iteration 'for' loop exists only so that 'break' can be
     # used to leave the example early when findCdf() returns NULL.
     for (zzz in 0) {

     # Find any CDF file
     cdfFile <- findCdf()
     if (is.null(cdfFile))
       break

     # Create a cell-index map (for writing)
     writeMap <- readCdfUnitsWriteMap(cdfFile)

     # Inverse map to be used to read cell elements such that, when
     # read unit by unit, they are read much faster.
     readMap <- invertMap(writeMap)

     # Validate the two maps: applying one after the other must give
     # back the indices 1, 2, ..., n.  seq_along() is used instead of
     # 1:length() so that an empty map is handled correctly.
     stopifnot(identical(readMap[writeMap], seq_along(readMap)))

     cat("Summary of the \"randomness\" of the cell indices:\n")
     # A move of zero means the next element is read from the position
     # immediately following the previous one.
     moves <- diff(readMap) - 1
     cat(sprintf("Number of unnecessary file re-positioning: %d (%.1f%%)\n",
                        sum(moves != 0), 100*sum(moves != 0)/length(moves)))
     cat(sprintf("Extra positioning: %.1fGb\n", sum(abs(moves))/1024^3))

     # Plot small and large moves in separate histograms (two stacked panels)
     smallMoves <- moves[abs(moves) <= 25]
     largeMoves <- moves[abs(moves)  > 25]
     layout(matrix(1:2))
     main <- "Non-signed file moves required in unordered file"
     hist(smallMoves, nclass=51, main=main, xlab="moves <=25 bytes")
     hist(largeMoves, nclass=101, main="", xlab="moves >25 bytes")

     # Clean up: restore a single-panel layout and remove temporary objects
     layout(1)
     rm(cdfFile, readMap, writeMap, moves, smallMoves, largeMoves, main)

     } # for (zzz in 0)

       
     # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     # Function to read Affymetrix probeset annotations
     # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     # Reads selected columns of a comma-separated Affymetrix probeset
     # annotation file.
     #
     # Arguments:
     #   pathname: Path to the CSV file.  Its first line must be the
     #             header line.
     #   ...:      Additional arguments passed to read.table().
     #
     # Returns a data frame with one row per probeset, with the
     # "Probe Set ID" values as row names and the columns
     #   chromosome:       ordered factor with levels 1,...,22,X,Y
     #   physicalPosition: integer
     #   dbSnpId:          character
     # Fields equal to "---" are read as NA.
     #
     # Throws an error if any of the required columns is missing.
     readAffymetrixProbesetAnnotation <- function(pathname, ...) {
       # Columns to read (unique to this example), given as
       # <name in file> = <name in returned data frame>
       cols <- c("Probe Set ID"="probeSet",
                 "Chromosome"="chromosome",
                 "Physical Position"="physicalPosition",
                 "dbSNP RS ID"="dbSnpId")

       # Read only the header line to find out which columns exist
       header <- scan(pathname, what="character", sep=",", quote="\"",
                      quiet=TRUE, nlines=1)

       # Fail early with an informative message rather than with
       # "undefined columns selected" when re-ordering further down
       missingCols <- setdiff(names(cols), header)
       if (length(missingCols) > 0) {
         stop("Annotation file '", pathname, "' lacks required column(s): ",
              paste(missingCols, collapse=", "), call.=FALSE)
       }

       # Column class "NULL" makes read.table() skip a column entirely
       colClasses <- rep("NULL", length(header))
       colClasses[header %in% names(cols)] <- "character"

       # Read the data (this is what takes time)
       df <- read.table(pathname, colClasses=colClasses, header=TRUE, sep=",",
                        quote="\"", na.strings="---", strip.white=TRUE,
                        check.names=FALSE, blank.lines.skip=FALSE, fill=FALSE,
                        comment.char="", ...)

       # Re-order columns to the order of 'cols' and rename them
       df <- df[, names(cols), drop=FALSE]
       colnames(df) <- cols

       # Use "Probe Set ID" as rownames. Note that if we use 'row.names=1'
       # or similar something goes wrong. /HB 2006-03-06
       rownames(df) <- df[[1]]
       df <- df[, -1, drop=FALSE]

       # Change types of columns: chromosome becomes an ordered factor
       # (values outside the levels, and NA, become NA) and the physical
       # position becomes an integer
       df[[1]] <- factor(df[[1]], levels=c(1:22, "X", "Y"), ordered=TRUE)
       df[[2]] <- as.integer(df[[2]])

       df
     } # readAffymetrixProbesetAnnotation()


     # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     # Main
     # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     # Example 2: create a write map in which the units are listed in
     # the order of the rows of the sorted annotation data.  The
     # single-iteration 'for' loop exists only so that 'break' can be
     # used to leave the example early when the required files are
     # not available.
     for (zzz in 0) {

     # Chip type whose cells are to be remapped
     chipType <- "Mapping50K_Xba240"

     # The annotation file is looked up in the current directory
     annoFile <- paste0(chipType, "_annot.csv")
     cdfFile <- findCdf(chipType)
     if (is.null(cdfFile) || !file.exists(annoFile)) {
       break
     }

     # Read SNP location details
     snpInfo <- readAffymetrixProbesetAnnotation(annoFile)

     # Sort rows by chromosome (column 1), then by physical position
     # (column 2)
     snpInfo <- snpInfo[order(snpInfo[[1]], snpInfo[[2]]), ]

     # Read unit names in CDF file
     unitNames <- readCdfUnitNames(cdfFile)

     # Look up each (sorted) probeset name among the CDF unit names,
     # giving the CDF unit indices in chromosomal order
     units <- match(rownames(snpInfo), unitNames)

     # ...and the write map listing these units first
     writeMap <- readCdfUnitsWriteMap(cdfFile, units=units)

     # The inverse of the write map; see invertMap()
     readMap <- invertMap(writeMap)

     # Clean up
     rm(chipType, annoFile, cdfFile, snpInfo, unitNames, units, readMap, writeMap)

     } # for (zzz in 0)

      

