\name{getIntensityMatrix}
\alias{getIntensityMatrix}
\title{Extract a matrix of microarray intensities from various kinds of objects}
\description{Extract a matrix of microarray intensities from various
  kinds of objects. This function is called by the function
  \code{\link[vsn:vsn]{vsn}}, and is normally not called by the user.}
\usage{getIntensityMatrix (intensities, verbose)}
\arguments{
\item{intensities}{Object of class
  \code{\link[Biobase:exprSet-class]{exprSet}}
  with raw intensity values from a microarray experiment. Alternatively,
  this may be a \code{\link{matrix}}, a \code{\link{data.frame}} with
  all numeric columns or an
  \code{\link[marray-Classes:marrayRaw-class]{marrayRaw}} object.}
\item{verbose}{Logical. If TRUE, some messages are printed.}
}
\value{
  A numeric matrix.
}

\author{Wolfgang Huber \url{http://www.dkfz.de/mga/whuber}}
\seealso{\code{\link[vsn:vsn]{vsn}}}
\examples{
  ## see the vignette
}
\keyword{manip}

\eof
\name{kidney}
\alias{kidney}
\docType{data}
\title{Intensity data for 1 cDNA slide with two adjacent tissue samples
  from a nephrectomy (kidney)}
\description{Intensity data for 1 cDNA slide with two adjacent tissue samples
  from a nephrectomy (kidney)}
\usage{data(kidney)}

\format{\code{kidney} is an exprSet containing the data from 1 cDNA
  chip. The 8704x2 matrix \code{exprs(kidney)} contains the
  spot intensities for the red (635 nm) and green color channels
  (532 nm) respectively. For each spot, a background estimate from a
  surrounding region was subtracted.
}

\details{The chip was produced in 2001 by Holger Sueltmann at
  the Division of Molecular Genome Analysis at the German Cancer
  Research Center in Heidelberg.}

\references{Huber W, Boer JM, von Heydebreck A, Gunawan B, Vingron M,
  Fuzesi L, Poustka A, Sueltmann H. Transcription profiling of renal
  cell carcinoma. Verh Dtsch Ges Pathol. 2002;86:153-64.
  PMID: 12647365}

\seealso{\code{\link{vsn}}}
\examples{
 data(kidney)
 plot(exprs(kidney), pch=".", log="xy")
 abline(a=0,b=1,col="blue")  
}
\keyword{datasets}

\eof
\name{log.na}
\alias{log.na}
\title{Wrapper around the log() function that avoids warnings
  when negative values are presented}
\description{Wrapper around the log() function that avoids warnings
  when negative values are presented}
\usage{
log.na(x, ...)
}
\arguments{
  \item{x}{A numeric or complex vector.}
  \item{...}{Further arguments that get passed on to
    \code{\link{log}}.}
}

\value{A vector of the same length as \code{x} containing the transformed
  values.  For \code{x<=0}, \code{NA} is returned without warning.}

\author{Wolfgang Huber \url{http://www.dkfz.de/mga/whuber}}
\seealso{\code{\link{log}}}

\examples{
   log.na(-3:3)
}

\keyword{math}

\eof
\name{lymphoma}
\alias{lymphoma}
\docType{data}
\title{Intensity data for 8 cDNA slides with CLL and DLBL samples from
  the Alizadeh et al. paper in Nature 2000}
\description{8 cDNA chips from Alizadeh lymphoma paper}
\usage{data(lymphoma)}

\format{\code{lymphoma} is an \code{\link[Biobase:exprSet-class]{exprSet}}
  containing the data from 8 chips
  from the lymphoma data set by Alizadeh et al. (see references). Each
  chip represents two samples: on color channel 1 (CH1, Cy3, green) the
  common reference sample, and on color channel 2 (CH2, Cy5, red) the
  various disease samples. See \code{pData(lymphoma)}. The 9216x16
  matrix \code{exprs(lymphoma)} contains the background-subtracted spot
  intensities (CH1I-CH1B and CH2I-CH2B, respectively). 
}

\details{
The chip intensity files were downloaded from the Stanford
microarray database. Starting from the link below, this was done by
following the links \emph{Published Data} -> 
\emph{Alizadeh AA, et al. (2000) Nature 403(6769):503-11} -> 
\emph{Data in SMD} -> \emph{Display Data}, and selecting the following 
8 slides:
\tabular{l}{
lc7b019\cr
lc7b047\cr
lc7b048\cr
lc7b056\cr
lc7b057\cr
lc7b058\cr
lc7b069\cr
lc7b070
}
Then, the script \code{makedata.R} from the \code{scripts} subdirectory
of this package was run to generate the \R data object.
}

\references{A. Alizadeh et al., Distinct types of diffuse large B-cell 
lymphoma identified by gene expression profiling. Nature 403(6769):503-11, 
Feb 3, 2000.}

\source{\url{http://genome-www5.stanford.edu/MicroArray/SMD}}

\seealso{\code{\link{vsn}}}
\examples{
   data(lymphoma)
   lymphoma
   pData(lymphoma)
}

\keyword{datasets}

\eof
\name{meanSdPlot}
\alias{meanSdPlot}
\title{Plot row standard deviations versus row means}
\description{Plot row standard deviations versus row means}
\usage{
meanSdPlot(x, 
           ranks = TRUE,
           xlab  = ifelse(ranks, "rank(mean)", "mean"),
           ylab  = "sd",
           pch   = ".",
           col, ...)} 
\arguments{
\item{x}{An object of class \code{\link[base:matrix]{matrix}} or
  \code{\link[Biobase:exprSet-class]{exprSet}}}
\item{ranks}{Logical, indicating whether the x-axis (means) should be plotted
  on the original scale (FALSE) or on the rank scale (TRUE). The latter
  distributes the data more evenly along the x-axis and allows a
  better visual assessment of the standard deviation as a function of 
  the mean.}
\item{xlab}{Character, label for the x-axis.}
\item{ylab}{Character, label for the y-axis.}
\item{pch}{Plot symbol.}
\item{col}{Color of plotted points. See details.}
\item{...}{Further arguments that get passed to plot.default.}
}

\details{Standard deviation and mean are calculated row-wise from the
  matrix \code{exprs(x)}. The scatterplot of these versus each other
  allows one to visually verify whether there is
  a dependence of the standard deviation (or variance) on the mean.
  The red dots depict the running median estimator (window-width 10\%).
  If there is no variance-mean dependence, then the line formed by the
  red dots should be approximately horizontal.

  If the \code{preprocessing} slot of the \code{description} slot of
  \code{x} is a \code{\link{list}} and contains an element named
  \code{vsnTrimSelection}, then the coloring of the points reflects
  the trimming that was used in the least trimmed sum of squares (LTS)
  estimation (see \code{\link{vsn}}). If the condition does not apply,
  and \code{col} is \code{NULL}, the points are drawn in black. If
  \code{col} is not \code{NULL}, its value is used for the coloring of
  the points.
}

\value{
  The function is called for its side effect, creating a plot on the
  active graphics device.
}

\author{Wolfgang Huber \url{http://www.dkfz.de/abt0840/whuber}}
\seealso{\code{\link{vsn}}}
\examples{
  data(kidney)
  exprs(kidney) = log.na(exprs(kidney))

  meanSdPlot(kidney)

  ## ...try this out with non-logged data, the lymphoma data, your data...
}

\keyword{hplot}

\eof
\name{nchoosek}
\alias{nchoosek}
\title{List all subsets of size k from n objects}
\description{List all subsets of size k from n objects}
\usage{nchoosek(n, k)}
\arguments{
  \item{n}{Integer}
  \item{k}{Integer}
}

\value{  
A matrix with k rows and as many columns as there are distinct
subsets. Each column contains integers between 1 and n, and represents a
subset. }

\seealso{\code{\link{choose}}}
\examples{
nchoosek(5,3)
}

\keyword{math}

\eof
\name{normalize.AffyBatch.vsn}
\alias{normalize.AffyBatch.vsn}
\title{Wrapper for vsn to be used as a normalization method in the package affy}
\description{Wrapper for \code{\link{vsn}} to be used as a normalization method in the package affy}
\usage{normalize.AffyBatch.vsn(abatch, subsample=20000, niter = 4, ...)}
\arguments{
  \item{abatch}{An object of type \code{\link[affy:AffyBatch-class]{AffyBatch}}.}
  \item{subsample}{The number of probes to be sampled for the fit of the transformation parameters.}
  \item{niter}{Parameter passed on to \code{\link{vsn}}.}
  \item{...}{Further parameters for \code{\link{vsn}}.}
}

\details{Please refer to the "details" and "references" sections of the man page for
  \code{\link{vsn}} for more details about this method.

  \bold{Important note}: after calling \code{\link{vsn}}, the function
  \code{normalize.AffyBatch.vsn} \bold{exponentiates} the data. This is done in
  order to make the behavior of this function similar to the other
  normalization methods in affy. There, it is assumed that in subsequent
  analysis steps (e.g. in \code{\link{medianpolish}}), the logarithm to
  base 2 needs to be taken.
}

\value{An object of class \code{\link[affy:AffyBatch-class]{AffyBatch}}.}
\author{Wolfgang Huber \url{mailto:w.huber@dkfz.de}}

\seealso{\code{\link{vsn}}}
\examples{
library(affy)
library(affydata)
data(Dilution)

## let affy know about vsn
normalize.AffyBatch.methods <- c(normalize.AffyBatch.methods, "vsn")

es1 = expresso(Dilution[1:2], 
               bg.correct       = FALSE,   ## bg correction is done by vsn
               normalize.method = "vsn",
               pmcorrect.method = "pmonly", 
               summary.method   = "medianpolish")

es2 = expresso(Dilution[1:2], 
               bgcorrect.method = "rma",
               normalize.method = "quantiles", 
               pmcorrect.method = "pmonly",
               summary.method   = "medianpolish")

## graphics output
if(interactive()) x11()
oldpar = par(mfrow=c(2,2), pch=".")

## extract expression values
x1 = exprs(es1)
x2 = exprs(es2) 
 
## scatter plot
plot(x1, main="vsn: chip 3 vs 4")
plot(x2, main="rma: chip 3 vs 4")

## rank(mean) - difference plot
ylim = c(-0.7, 0.7)
plot(rank(rowSums(x1)), diff(t(x1)), ylim=ylim, main="rank(mean) vs differences")
abline(h=0, col="red")

plot(rank(rowSums(x2)), diff(t(x2)), ylim=ylim, main="rank(mean) vs differences")
abline(h=0, col="red")

## reset old plotting parameters 
par(oldpar)
}

\keyword{robust}

\eof
\name{rowSds}
\alias{rowSds}
\title{Row standard deviation of a numeric array}
\description{
  Row standard deviation of a numeric array
}
\usage{
rowSds(x, ...)
}
\arguments{
  \item{x}{An array of two or more dimensions, containing numeric,
          complex, integer or logical values, or a numeric data frame.}
  \item{...}{Further arguments that get passed on to
    \code{\link{rowMeans}} and \code{\link{rowSums}}.}
}

\value{
  A numeric or complex array of suitable size, or a vector if the
  result is one-dimensional.  The \code{dimnames} (or \code{names} for a
  vector result) are taken from the original array.
}
\details{This is a convenience function; the main work is done in
  \code{\link{rowMeans}} and \code{\link{rowSums}}. See the function
  definition, it is very simple.
}
\author{Wolfgang Huber \url{http://www.dkfz.de/mga/whuber}}
\seealso{\code{\link{rowMeans}} and \code{\link{rowSums}}}

\examples{
   a = matrix(rnorm(1e4), nrow=10)
   rowSds(a)
}

\keyword{array}
\keyword{manip}

\eof
\name{sagmbSimulateData}
\alias{sagmbSimulateData}
\alias{sagmbAssess}
\title{Simulate data and assess vsn's parameter estimation}
\description{Functions to validate and assess the performance of vsn
  through simulation of data.}
\usage{
sagmbSimulateData(n=8064, d=2, de=0, up=0.5)
sagmbAssess(h1, sim)
}
\arguments{
  \item{n}{Numeric. Number of probes (rows).}
  \item{d}{Numeric. Number of arrays (columns).}
  \item{de}{Numeric. Fraction of differentially expressed genes.}
  \item{up}{Numeric. Fraction of up-regulated genes among the differentially expressed genes.}
  \item{h1}{Matrix. Calibrated and transformed data, e.g., as obtained
    from \code{\link{vsn}}.}
  \item{sim}{List. The output of a previous call to
    \code{sagmbSimulateData}, see Value.}
}

\value{
  For \code{sagmbSimulateData}, a list with three components:
  \code{hy}, an \code{n x d} matrix with the true (=simulated)
  calibrated, transformed data;
  \code{y}, an \code{n x d} matrix with the simulated
  uncalibrated raw data - this is intended to be fed into
  \code{\link[vsn:vsn]{vsn}};
  \code{is.de}, a logical vector of length \code{n}, specifying
  which probes are simulated to be differentially expressed.

  For \code{sagmbAssess}, a number: the root mean squared
  difference between true and estimated transformed data.
}
\details{Please see the vignette.}

\references{Wolfgang Huber, Anja von Heydebreck, Holger Sueltmann,
  Annemarie Poustka, and Martin Vingron (2003)
  "Parameter estimation for the calibration and variance stabilization
  of microarray data",
  Statistical Applications in Genetics and Molecular Biology:
  Vol. 2: No. 1, Article 3.
  \url{http://www.bepress.com/sagmb/vol2/iss1/art3}}

\author{Wolfgang Huber \url{http://www.dkfz.de/mga/whuber}}
\seealso{ \code{\link{vsn}} }

\examples{
  sim <- sagmbSimulateData()
  ny  <- vsn(sim$y)
  res <- sagmbAssess(exprs(ny), sim)
  res
}

\keyword{datagen}

\eof
\name{vsn}
\alias{vsn}
\title{Variance stabilization and calibration for microarray data}
\description{Robust estimation of variance-stabilizing and calibrating 
  transformations for microarray data. This is the main function of
  this package; see also the vignette vsn.pdf.}
\usage{
vsn(intensities,
    lts.quantile = 0.5,
    verbose      = TRUE,
    niter        = 10,
    cvg.check    = NULL,
    pstart       = NULL,
    describe.preprocessing = TRUE)
}
\arguments{
\item{intensities}{An object that contains intensity values from
  a microarray experiment. See
  \code{\link[vsn:getIntensityMatrix]{getIntensityMatrix}} for details.
  The intensities are assumed to be the raw
  scanner data, summarized over the spots by an image analysis program,
  and possibly "background" subtracted.
  The intensities must not be logarithmically or otherwise transformed,
  and not thresholded or "floored". NAs are not accepted.
  See details.}
\item{lts.quantile}{Numeric. The quantile that is used for the resistant
  least trimmed sum of squares regression. Allowed values are between
  0.5 and 1, corresponding to least median sum of squares regression,
  and to ordinary least sum of squares regression, respectively.}
\item{niter}{Integer. The number of iterations to be used in the least
  trimmed sum of squares regression.}
\item{verbose}{Logical. If TRUE, some messages are printed.}
\item{pstart}{Numeric vector. Starting values for the model parameters
  in the iterative parameter estimation algorithm. If NULL, the function
  tries to determine reasonable starting values from the distribution of
  \code{intensities}.}
\item{describe.preprocessing}{Logical. If TRUE, calibration and
  transformation parameters, plus some other information are stored in
  the \code{preprocessing} slot of the returned object. See details.}
\item{cvg.check}{List. If non-NULL, this allows finer control of the
  iterative least trimmed sum of squares regression. See details.}
}
\details{The function calibrates for sample-to-sample variations through
  shifting and scaling, and transforms the intensities to a scale where
  the variance is approximately independent of the mean intensity.
  The variance stabilizing transformation is equivalent to the
  natural logarithm in the high-intensity range, and to a
  linear transformation in the low-intensity range. In an intermediate
  range, the \emph{arsinh} function interpolates smoothly between the
  two. The calibration consists of estimating an offset \code{offs[i]}
  and a scale factor \code{fac[i]} for each column \code{i} of the matrix
  \code{intensities}. Thus, the calibration is:
  
  \code{intensities[k,i] <- intensities[k,i] * fac[i] + offs[i]}

  The parameters \code{offs[i]} and \code{fac[i]} are estimated through
  a robust variant of maximum likelihood. The model assumes that for
  the majority of genes the expression levels are not much different
  across the samples, i.e., that only a minority of genes (less than
  a fraction of 1-\code{lts.quantile}) is differentially expressed.

  \bold{Format:} The format of the matrix of intensities is as follows:
  for the \bold{two-color printed array technology}, each row
  corresponds to one spot, and the columns to the different arrays
  and wave-lengths (usually red and green, but could be any number).
  For example, if there are 10 arrays, the matrix would have 20 columns,
  columns 1...10 containing the green intensities, and 11...20 the
  red ones. In fact, the ordering of the columns does not matter to
  \code{vsn}, but it is your responsibility to keep track of it for
  subsequent analyses.
  For \bold{one-color arrays}, each row corresponds to a probe, and each
  column to an array.

  \bold{Performance:} This function is slow. That is due to the nested
  iteration loops of the numerical optimization of the likelihood function
  and the heuristic that identifies the non-outlying data points in the
  least trimmed squares regression. For large arrays with many tens of
  thousands of probes, you may want to consider random subsetting: that is,
  use only a subset of, e.g., 10,000-20,000 rows of the data matrix
  \code{intensities} to fit the parameters, then apply the transformation
  to all the data, using \code{\link{vsnh}}. An example for this can be
  seen in the function \code{\link{normalize.AffyBatch.vsn}}, whose code
  you can inspect by typing \code{normalize.AffyBatch.vsn} on the R
  command line.

  \bold{Calibration and transformation parameters:} The parameters
  are stored in the \code{preprocessing} slot of the \code{description}
  slot of the \code{\link[Biobase:exprSet-class]{exprSet}} object that
  is returned, in the form of a \code{\link{list}} with three elements
  \itemize{
    \item \code{vsnParams}: a numeric vector of length 2*d with the
    parameters, where d is the number of columns of \code{intensities}.
    \item \code{vsnParamsIter}: a (2*d) x niter numeric matrix that
    contains the parameter trajectory during the
    iterative fit process (see \code{\link{vsnPlotPar}}).
    \item \code{vsnTrimSelection}: a logical vector of length n (the
    number of rows of the intensities matrix) that for
    each row reports whether it was below
    (TRUE) or above (FALSE) the trimming threshold.
    }

  If \code{intensities} has class
  \code{\link[Biobase:exprSet-class]{exprSet}}, and its \code{description} 
  slot has class \code{\link[Biobase:MIAME-class]{MIAME}}, then this
  list is appended to any existing entries in the \code{preprocessing}
  slot. Otherwise, the \code{description} object and its
  \code{preprocessing} slot are created.

  By default, if \code{cvg.check} is \code{NULL}, the function will run
  the fixed number \code{niter} of iterations in the least trimmed sum
  of squares regression. More fine-grained control can be obtained by
  passing a list with elements \code{eps} and \code{n}. If the maximum
  change between transformed data values is smaller than \code{eps} for
  \code{n} subsequent iterations, then the iteration terminates.
}

\value{An object of class \code{\link[Biobase:exprSet-class]{exprSet}}.
  Differences
  between the columns of the transformed intensities may be interpreted
  as "regularized" or "shrunken" log-ratios. For the calibration and
  transformation parameters, see the \emph{Details} section.
}

\references{Variance stabilization applied to microarray data
calibration and to the quantification of differential expression,
Wolfgang Huber, Anja von Heydebreck, Holger Sueltmann, Annemarie
Poustka, Martin Vingron; Bioinformatics (2002) 18 Suppl.1 S96-S104.

Parameter estimation for the calibration and variance stabilization 
of microarray data, 
Wolfgang Huber, Anja von Heydebreck, Holger Sueltmann, 
Annemarie Poustka, and Martin Vingron;  
Statistical Applications in Genetics and Molecular Biology (2003)
Vol. 2 No. 1, Article 3.
\url{http://www.bepress.com/sagmb/vol2/iss1/art3}.}

\author{Wolfgang Huber \url{http://www.dkfz.de/mga/whuber}}

\seealso{\code{\link{exprSet-class}}, \code{\link{MIAME-class}},
  \code{\link{normalize.AffyBatch.vsn}}}

\examples{
data(kidney)

if(interactive()) {
  x11(width=9, height=4.5)
  par(mfrow=c(1,2))
}
plot(log.na(exprs(kidney)), pch=".", main="log-log")

vsnkid = vsn(kidney)   ## transform and calibrate
plot(exprs(vsnkid), pch=".", main="h-h")

if (interactive()) {
  x11(width=9, height=4)
  par(mfrow=c(1,3))
}

meanSdPlot(vsnkid)
vsnPlotPar(vsnkid, "factors")
vsnPlotPar(vsnkid, "offsets")

## this should always hold true
params = preproc(description(vsnkid))$vsnParams
stopifnot(all(vsnh(exprs(kidney), params) == exprs(vsnkid))) 
}

\keyword{robust}

\eof
\name{vsnPlotPar}
\alias{vsnPlotPar}
\title{Plot trajectories of calibration and transformation parameters for
  a vsn fit}
\description{Plot trajectories of calibration and transformation parameters for
  a vsn fit}
\usage{vsnPlotPar(x, what, xlab="iter", ylab=what, ...)} 
\arguments{
  \item{x}{An object of class \code{\link{exprSet-class}} which has been
    created by the function \code{\link{vsn}}.}
  \item{what}{Character, should either be "factors" or "offsets".}
  \item{xlab}{Character, label for the x-axis.}
  \item{ylab}{Character, label for the y-axis.}
  \item{...}{Further arguments that get passed to plot.default.}
}
\details{The plot that is created by this function may help in assessing
  whether the parameter estimation in \code{\link{vsn}} has sufficiently
  converged.}
\value{
  The function is called for its side effect, creating a plot on the
  active graphics device.
}
\author{Wolfgang Huber \url{http://www.dkfz.de/abt0840/whuber}}
\seealso{\code{\link{vsn}}}
\examples{
  ## see example for vsn
}
\keyword{hplot}



\eof
\name{vsnh}
\alias{vsnh}
\title{A function that transforms a matrix of microarray intensities}
\description{A function that transforms a matrix of microarray intensities}
\usage{vsnh(y, p)}
\arguments{
  \item{y}{A numeric matrix containing intensity values from an array experiment. 
    It may contain NA values.}
  \item{p}{A numeric vector with the transformation parameters, of length 2*ncol(y). 
    NA values are not allowed. See Details.}
}

\value{
  A numeric matrix of the same size as y, with the transformed data.
}
\details{
  The transformation is defined as:
  
  vsnh(y,p)[k,i] = asinh( p[i] + p[i+d] * y[k,i] )
  
  where d=ncol(y) is the number of samples, 
  i=1:d counts over the samples,
  p[1:d] are the additive calibration offsets,
  p[(d+1):(2*d)] are the calibration factors,
  and k=1:nrow(y) counts over the probes.
  
}
\references{Variance stabilization applied to microarray data
calibration and to the quantification of differential expression,
Wolfgang Huber, Anja von Heydebreck, Holger Sueltmann, Annemarie
Poustka, Martin Vingron; Bioinformatics (2002) 18 Suppl.1 S96-S104.}

\author{Wolfgang Huber \url{http://www.dkfz.de/mga/whuber}}
\seealso{ \code{\link{vsn}} }

\examples{
##see vsn
}

\keyword{print}

\eof
