-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Linear regression between two samples, based on the 'statistics' package.
--   
--   Provides functions to perform a linear regression between 2 samples,
--   see the documentation of the linearRegression functions. This library
--   is based on the <a>statistics</a> package.
--   
--   <ul>
--   <li>0.3: you can now use all functions on any instance of the Vector
--   class (not just unboxed vectors).</li>
--   <li>0.2.4: added distribution estimations for standard regression
--   parameters.</li>
--   <li>0.2.3: added robust-fit support.</li>
--   <li>0.2.2: added the Total-Least-Squares version and refactored the
--   code to eliminate duplication.</li>
--   <li>0.2.1: added the r-squared version and improved
--   performance.</li>
--   </ul>
--   
--   Code sample:
--   
--   <pre>
--   import qualified Data.Vector.Unboxed as U
--   
--   test :: Int -&gt; IO ()
--   test k = do
--     let n = 10000000
--     let a = k*n + 1
--     let b = (k+1)*n
--     let xs = U.fromList [a..b]
--     let ys = U.map (\x -&gt; x*100 + 2000) xs
--     -- thus 2000 and 100 are the alpha and beta we want (Y = alpha + beta*X)
--     putStrLn "linearRegression:"
--     print $ linearRegression xs ys
--   </pre>
--   
--   The r-squared and Total-Least-Squares versions work the same way.
@package statistics-linreg
@version 0.3

module Statistics.LinearRegression

-- | Simple linear regression between 2 samples. Takes two vectors Y={yi}
--   and X={xi} and returns (alpha, beta) such that Y = alpha + beta*X,
--   i.e. alpha is the intercept and beta is the slope of the fitted line.
--   
--   NOTE(review): the package-level code sample calls this as
--   <tt>linearRegression xs ys</tt>, i.e. with the X sample first, while
--   the sentence above names Y first; confirm the argument order against
--   the implementation.
linearRegression :: Vector v Double => v Double -> v Double -> (Double, Double)

-- | Simple linear regression between 2 samples that also reports the
--   coefficient of determination. Takes two vectors Y={yi} and X={xi} and
--   returns (alpha, beta, r*r) such that Y = alpha + beta*X (alpha is the
--   intercept, beta is the slope) and where r is the Pearson
--   product-moment correlation coefficient (see <a>correl</a>).
--   
--   NOTE(review): the order in which the X and Y samples are passed is
--   not stated explicitly here; confirm it against the implementation.
linearRegressionRSqr :: Vector v Double => v Double -> v Double -> (Double, Double, Double)

-- | Total Least Squares (TLS) linear regression. Assumes x-axis values
--   (and not just y-axis values) are random variables and that both
--   variables have similar distributions. The interface is the same as
--   that of <a>linearRegression</a>: two sample vectors in, an
--   (alpha, beta) pair out.
linearRegressionTLS :: Vector v Double => v Double -> v Double -> (Double, Double)

-- | Pearson's product-moment correlation coefficient of two samples.
--   Takes the two sample vectors and returns the coefficient as a single
--   <tt>Double</tt>.
correl :: Vector v Double => v Double -> v Double -> Double

-- | Covariance of two samples. Takes the two sample vectors and returns
--   their covariance as a single <tt>Double</tt>.
covar :: Vector v Double => v Double -> v Double -> Double

-- | The error (or residual) mean square of a sample w.r.t. an estimated
--   regression line. This serves as an estimate for the variance of the
--   sampled data.
--   
--   Accepts, in order, the regression parameters (alpha,beta) and the
--   sample vectors X and Y, and returns the mean square error as a
--   <tt>Double</tt>. Note that the vector type must be able to hold
--   pairs of doubles as well as plain doubles (see the second class
--   constraint).
linearRegressionMSE :: (Vector v Double, Vector v (Double, Double)) => (Double, Double) -> v Double -> v Double -> Double

-- | The estimated distributions of the regression parameters (alpha and
--   beta) assuming normal, identical distributions of Y, the sampled data.
--   These can serve to get confidence intervals for the regression
--   parameters.
--   
--   Accepts, in order, the regression parameters (alpha,beta) and the
--   sample vectors X and Y. Returns a pair of distributions, the first
--   for alpha and the second for beta. The distributions are StudentT
--   distributions centered at the estimated (alpha,beta) respectively,
--   with parameter numbers n-2 (where n is the initial sample size) and
--   with standard deviations that are extracted from the sampled data
--   based on its MSE (see <a>linearRegressionMSE</a>).
--   
--   NOTE(review): "parameter numbers n-2" presumably means n-2 degrees of
--   freedom; confirm. The reference list that "[3]" points to is not
--   included in this documentation.
--   See chapter 2 of reference [3] for details.
linearRegressionDistributions :: (Vector v Double, Vector v (Double, Double)) => (Double, Double) -> v Double -> v Double -> (LinearTransform StudentT, LinearTransform StudentT)

-- | Finds a robust-fit linear estimate between two samples. The
--   procedure requires randomization, so the result is produced inside a
--   <tt>MonadRandom</tt> monad; see <a>nonRandomRobustFit</a> for a pure
--   wrapper.
--   
--   Takes the <a>EstimationParameters</a> that tune the algorithm and the
--   two sample vectors, and yields an <a>EstimatedRelation</a>.
--   
--   The procedure is based on the one described in the reference.
--   NOTE(review): the reference itself is not cited in this
--   documentation.
robustFit :: (MonadRandom m, Vector v Double) => EstimationParameters -> v Double -> v Double -> m EstimatedRelation

-- | A wrapper that executes <a>robustFit</a> using a default random
--   generator (meaning it is only pseudo-random). It takes the same
--   arguments as <a>robustFit</a> but returns the
--   <a>EstimatedRelation</a> directly rather than inside a monad.
nonRandomRobustFit :: Vector v Double => EstimationParameters -> v Double -> v Double -> EstimatedRelation

-- | Robust fit yielding also the R-square value of the "clean" dataset.
--   Takes the same arguments as <a>robustFit</a> and yields, inside the
--   <tt>MonadRandom</tt> monad, a pair of the <a>EstimatedRelation</a>
--   and the R-square value.
robustFitRSqr :: (MonadRandom m, Vector v Double, Vector v (Double, Double)) => EstimationParameters -> v Double -> v Double -> m (EstimatedRelation, Double)

-- | The robust fit algorithm used has various parameters that can be
--   specified using the <a>EstimationParameters</a> record. A ready-made
--   value with the defaults quoted on each field below is available as
--   <a>defaultEstimationParameters</a>. The numeric fields are strict;
--   the <a>estimator</a> and <a>errorFunction</a> fields are not.
data EstimationParameters
EstimationParameters :: !Double -> !Int -> !Int -> !Int -> !Int -> !Int -> Estimator -> ErrorFunction -> EstimationParameters

-- | Maximal fraction of outliers expected in the sample (default 0.25).
[outlierFraction] :: EstimationParameters -> !Double

-- | Number of concentration steps to take for the initial evaluation of a
--   solution (default 3).
[shortIterationSteps] :: EstimationParameters -> !Int

-- | Maximal number of sampled subsets (pairs of points) to use as starting
--   points (default 500).
[maxSubsetsNum] :: EstimationParameters -> !Int

-- | If the initial sample is large, and thus gets subdivided, this is the
--   number of candidate estimations to take from each subgroup, on which
--   complete convergence will be executed (default 10).
[groupSubsets] :: EstimationParameters -> !Int

-- | Maximal size of a sample that can be analyzed without any
--   sub-division (default 600).
[mediumSetSize] :: EstimationParameters -> !Int

-- | Maximal size of a sample that does not require two-step sub-division
--   (see reference article) (default 1500).
[largeSetSize] :: EstimationParameters -> !Int

-- | <a>Estimator</a> function to use (default <a>linearRegression</a>).
[estimator] :: EstimationParameters -> Estimator

-- | <a>ErrorFunction</a> to use (default <a>linearRegressionError</a>).
[errorFunction] :: EstimationParameters -> ErrorFunction

-- | An <a>ErrorFunction</a> is a function that computes the error of a
--   given point from an estimate: it takes the <a>EstimatedRelation</a>
--   and an (x, y)-style pair of doubles and returns the error as a
--   <tt>Double</tt>. This module provides two error functions
--   corresponding to the two <a>Estimator</a> functions it defines:
--   
--   <ul>
--   <li>Vertical distance squared via <a>linearRegressionError</a> that
--   should be used with <a>linearRegression</a></li>
--   <li>Total distance squared via <a>linearRegressionTLSError</a> that
--   should be used with <a>linearRegressionTLS</a></li>
--   </ul>
--   
--   NOTE(review): the pair is presumably (x, y) in that order; confirm
--   against the implementation.
type ErrorFunction = (EstimatedRelation -> (Double, Double) -> Double)

-- | An <a>Estimator</a> is a function that generates an estimated linear
--   regression based on 2 samples: it takes two <tt>Sample</tt>s and
--   returns an <a>EstimatedRelation</a>. This module provides two
--   estimator functions: <a>linearRegression</a> and
--   <a>linearRegressionTLS</a>.
type Estimator = (Sample -> Sample -> EstimatedRelation)

-- | An estimated linear relation between 2 samples is (alpha,beta) such
--   that Y = alpha + beta*X, i.e. the first component is the intercept
--   and the second component is the slope.
type EstimatedRelation = (Double, Double)

-- | Default set of parameters to use (see reference for details). The
--   individual default values are quoted in the documentation of each
--   field of <a>EstimationParameters</a>.
defaultEstimationParameters :: EstimationParameters

-- | The error function matching <a>linearRegression</a>: the square of
--   the <i>vertical</i> distance of a point from the line.
linearRegressionError :: ErrorFunction

-- | The error function matching <a>linearRegressionTLS</a>: the square of
--   the <i>total</i> distance of a point from the line.
linearRegressionTLSError :: ErrorFunction

-- | Calculate the optimal (local minimum) estimate based on an initial
--   estimate. The local minimum may not be the global (a.k.a. best)
--   estimate but starting from enough different initial estimates should
--   yield the global optimum eventually.
--   
--   Takes, in order, the <a>EstimationParameters</a>, the two sample
--   vectors and the initial <a>EstimatedRelation</a>, and returns the
--   converged <a>EstimatedRelation</a>.
converge :: Vector v Double => EstimationParameters -> v Double -> v Double -> EstimatedRelation -> EstimatedRelation
