Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cluster.cpp File Reference
#include "oldheap.h"
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "helpers.h"
#include "matrix.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>

Go to the source code of this file.

Classes

struct  TEMPCLUSTER
 
struct  STATISTICS
 
struct  BUCKETS
 
struct  CHISTRUCT
 
struct  ClusteringContext
 

Macros

#define HOTELLING   1
 
#define FTABLE_X   10
 
#define FTABLE_Y   100
 
#define MINVARIANCE   0.0004
 
#define MINSAMPLESPERBUCKET   5
 
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)
 
#define MINSAMPLESNEEDED   1
 
#define BUCKETTABLESIZE   1024
 
#define NORMALEXTENT   3.0
 
#define Odd(N)   ((N)%2)
 
#define Mirror(N, R)   ((R) - (N) - 1)
 
#define Abs(N)   ( ( (N) < 0 ) ? ( -(N) ) : (N) )
 
#define SqrtOf2Pi   2.506628275
 
#define LOOKUPTABLESIZE   8
 
#define MAXDEGREESOFFREEDOM   MAXBUCKETS
 
#define MAXNEIGHBORS   2
 
#define MAXDISTANCE   MAX_FLOAT32
 
#define CHIACCURACY   0.01
 
#define MINALPHA   (1e-200)
 
#define INITIALDELTA   0.1
 
#define DELTARATIO   0.1
 
#define ILLEGAL_CHAR   2
 

Typedefs

typedef FLOAT64(* DENSITYFUNC )(inT32)
 
typedef FLOAT64(* SOLVEFUNC )(CHISTRUCT *, double)
 

Functions

void CreateClusterTree (CLUSTERER *Clusterer)
 
void MakePotentialClusters (ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
 
CLUSTERFindNearestNeighbor (KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
 
CLUSTERMakeNewCluster (CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
 
inT32 MergeClusters (inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
 
void ComputePrototypes (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
PROTOTYPEMakePrototype (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
 
PROTOTYPEMakeDegenerateProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
 
PROTOTYPETestEllipticalProto (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPEMakeSphericalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeEllipticalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeMixedProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
 
void MakeDimRandom (uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
 
void MakeDimUniform (uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
 
STATISTICSComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
 
PROTOTYPENewSphericalProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewEllipticalProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewMixedProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewSimpleProto (inT16 N, CLUSTER *Cluster)
 
BOOL8 Independent (PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
 
BUCKETSGetBuckets (CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
BUCKETSMakeBuckets (DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
uinT16 OptimumNumberOfBuckets (uinT32 SampleCount)
 
FLOAT64 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 NormalDensity (inT32 x)
 
FLOAT64 UniformDensity (inT32 x)
 
FLOAT64 Integral (FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
 
void FillBuckets (BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 NormalBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 UniformBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
BOOL8 DistributionOK (BUCKETS *Buckets)
 
void FreeStatistics (STATISTICS *Statistics)
 
void FreeBuckets (BUCKETS *Buckets)
 
void FreeCluster (CLUSTER *Cluster)
 
uinT16 DegreesOfFreedom (DISTRIBUTION Distribution, uinT16 HistogramBuckets)
 
int NumBucketsMatch (void *arg1, void *arg2)
 
int ListEntryMatch (void *arg1, void *arg2)
 
void AdjustBuckets (BUCKETS *Buckets, uinT32 NewSampleCount)
 
void InitBuckets (BUCKETS *Buckets)
 
int AlphaMatch (void *arg1, void *arg2)
 
CHISTRUCTNewChiStruct (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 Solve (SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
 
FLOAT64 ChiArea (CHISTRUCT *ChiParams, FLOAT64 x)
 
BOOL8 MultipleCharSamples (CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
 
double InvertMatrix (const float *input, int size, float *inv)
 
CLUSTERERMakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
FLOAT32 Mean (PROTOTYPE *Proto, uinT16 Dimension)
 
FLOAT32 StandardDeviation (PROTOTYPE *Proto, uinT16 Dimension)
 
inT32 MergeClusters (inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
 

Variables

const double FTable [FTABLE_Y][FTABLE_X]
 

Macro Definition Documentation

#define Abs (   N)    ( ( (N) < 0 ) ? ( -(N) ) : (N) )

Definition at line 204 of file cluster.cpp.

#define BUCKETTABLESIZE   1024

Definition at line 159 of file cluster.cpp.

#define CHIACCURACY   0.01
#define DELTARATIO   0.1
#define FTABLE_X   10

Definition at line 30 of file cluster.cpp.

#define FTABLE_Y   100

Definition at line 31 of file cluster.cpp.

#define HOTELLING   1

Definition at line 29 of file cluster.cpp.

#define ILLEGAL_CHAR   2
#define INITIALDELTA   0.1
#define LOOKUPTABLESIZE   8

Definition at line 224 of file cluster.cpp.

#define MAXDEGREESOFFREEDOM   MAXBUCKETS

Definition at line 225 of file cluster.cpp.

#define MAXDISTANCE   MAX_FLOAT32
#define MAXNEIGHBORS   2
#define MINALPHA   (1e-200)
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)

Definition at line 150 of file cluster.cpp.

#define MINSAMPLESNEEDED   1

Definition at line 151 of file cluster.cpp.

#define MINSAMPLESPERBUCKET   5

Definition at line 149 of file cluster.cpp.

#define MINVARIANCE   0.0004

Definition at line 141 of file cluster.cpp.

#define Mirror (   N,
 
)    ((R) - (N) - 1)

Definition at line 203 of file cluster.cpp.

#define NORMALEXTENT   3.0

Definition at line 160 of file cluster.cpp.

#define Odd (   N)    ((N)%2)

Definition at line 202 of file cluster.cpp.

#define SqrtOf2Pi   2.506628275

Definition at line 214 of file cluster.cpp.

Typedef Documentation

typedef FLOAT64(* DENSITYFUNC)(inT32)

Definition at line 199 of file cluster.cpp.

typedef FLOAT64(* SOLVEFUNC)(CHISTRUCT *, double)

Definition at line 200 of file cluster.cpp.

Function Documentation

void AdjustBuckets ( BUCKETS Buckets,
uinT32  NewSampleCount 
)

Definition at line 2357 of file cluster.cpp.

2357  {
2358 /*
2359  ** Parameters:
2360  ** Buckets histogram data structure to adjust
2361  ** NewSampleCount new sample count to adjust to
2362  ** Operation:
2363  ** This routine multiplies each ExpectedCount histogram entry
2364  ** by NewSampleCount/OldSampleCount so that the histogram
2365  ** is now adjusted to the new sample count.
2366  ** Return: none
2367  ** Exceptions: none
2368  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2369  */
2370  int i;
2371  FLOAT64 AdjustFactor;
2372 
2373  AdjustFactor = (((FLOAT64) NewSampleCount) /
2374  ((FLOAT64) Buckets->SampleCount));
2375 
2376  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2377  Buckets->ExpectedCount[i] *= AdjustFactor;
2378  }
2379 
2380  Buckets->SampleCount = NewSampleCount;
2381 
2382 } // AdjustBuckets
uinT32 SampleCount
Definition: cluster.cpp:176
double FLOAT64
Definition: host.h:112
uinT16 NumberOfBuckets
Definition: cluster.cpp:179
FLOAT32 * ExpectedCount
Definition: cluster.cpp:182
int AlphaMatch ( void *  arg1,
void *  arg2 
)

Definition at line 2407 of file cluster.cpp.

2408  { //CHISTRUCT *SearchKey)
2409 /*
2410  ** Parameters:
2411  ** ChiStruct chi-squared struct being tested for a match
2412  ** SearchKey chi-squared struct that is the search key
2413  ** Operation:
2414  ** This routine is used to search a list of structures which
2415  ** hold pre-computed chi-squared values for a chi-squared
2416  ** value whose corresponding alpha field matches the alpha
2417  ** field of SearchKey.
2418  ** It is called by the list search routines.
2419  ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2420  ** Exceptions: none
2421  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2422  */
2423  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2424  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2425 
2426  return (ChiStruct->Alpha == SearchKey->Alpha);
2427 
2428 } // AlphaMatch
FLOAT64 Alpha
Definition: cluster.cpp:187
FLOAT64 ChiArea ( CHISTRUCT ChiParams,
FLOAT64  x 
)

Definition at line 2521 of file cluster.cpp.

2521  {
2522 /*
2523  ** Parameters:
2524  ** ChiParams contains degrees of freedom and alpha
2525  ** x value of chi-squared to evaluate
2526  ** Operation:
2527  ** This routine computes the area under a chi density curve
2528  ** from 0 to x, minus the desired area under the curve. The
2529  ** number of degrees of freedom of the chi curve is specified
2530  ** in the ChiParams structure. The desired area is also
2531  ** specified in the ChiParams structure as Alpha ( or 1 minus
2532  ** the desired area ). This routine is intended to be passed
2533  ** to the Solve() function to find the value of chi-squared
2534  ** which will yield a desired area under the right tail of
2535  ** the chi density curve. The function will only work for
2536  ** even degrees of freedom. The equations are based on
2537  ** integrating the chi density curve in parts to obtain
2538  ** a series that can be used to compute the area under the
2539  ** curve.
2540  ** Return: Error between actual and desired area under the chi curve.
2541  ** Exceptions: none
2542  ** History: Fri Aug 4 12:48:41 1989, DSJ, Created.
2543  */
2544  int i, N;
2545  FLOAT64 SeriesTotal;
2546  FLOAT64 Denominator;
2547  FLOAT64 PowerOfx;
2548 
2549  N = ChiParams->DegreesOfFreedom / 2 - 1;
2550  SeriesTotal = 1;
2551  Denominator = 1;
2552  PowerOfx = 1;
2553  for (i = 1; i <= N; i++) {
2554  Denominator *= 2 * i;
2555  PowerOfx *= x;
2556  SeriesTotal += PowerOfx / Denominator;
2557  }
2558  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2559 
2560 } // ChiArea
FLOAT64 Alpha
Definition: cluster.cpp:187
double FLOAT64
Definition: host.h:112
uinT16 DegreesOfFreedom
Definition: cluster.cpp:186
LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

ClusterSamples *********************************************************** Parameters: Clusterer data struct containing samples to be clustered Config parameters which control clustering process Operation: This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info. If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree. In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config. Return: Pointer to a list of prototypes Exceptions: None History: 5/29/89, DSJ, Created.

Definition at line 504 of file cluster.cpp.

504  {
505  //only create cluster tree if samples have never been clustered before
506  if (Clusterer->Root == NULL)
507  CreateClusterTree(Clusterer);
508 
509  //deallocate the old prototype list if one exists
510  FreeProtoList (&Clusterer->ProtoList);
511  Clusterer->ProtoList = NIL_LIST;
512 
513  //compute prototypes starting at the root node in the tree
514  ComputePrototypes(Clusterer, Config);
515  return (Clusterer->ProtoList);
516 } // ClusterSamples
#define NIL_LIST
Definition: oldlist.h:126
#define NULL
Definition: host.h:144
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:694
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:927
CLUSTER * Root
Definition: cluster.h:91
LIST ProtoList
Definition: cluster.h:92
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:560
FLOAT64 ComputeChiSquared ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

Definition at line 1880 of file cluster.cpp.

1903 {
1904  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1905 
1906  CHISTRUCT *OldChiSquared;
1907  CHISTRUCT SearchKey;
1908 
1909  // limit the minimum alpha that can be used - if alpha is too small
1910  // it may not be possible to compute chi-squared.
1911  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
1912  if (Odd (DegreesOfFreedom))
1913  DegreesOfFreedom++;
1914 
1915  /* find the list of chi-squared values which have already been computed
1916  for the specified number of degrees of freedom. Search the list for
1917  the desired chi-squared. */
1918  SearchKey.Alpha = Alpha;
1919  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1920  &SearchKey, AlphaMatch));
1921 
1922  if (OldChiSquared == NULL) {
1923  OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1924  OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1925  (FLOAT64) DegreesOfFreedom,
1926  (FLOAT64) CHIACCURACY);
1927  ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1928  OldChiSquared);
1929  }
1930  else {
1931  // further optimization might move OldChiSquared to front of list
1932  }
1933 
1934  return (OldChiSquared->ChiSquared);
1935 
1936 } // ComputeChiSquared
FLOAT64 Alpha
Definition: cluster.cpp:187
#define CHIACCURACY
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2432
#define Odd(N)
Definition: cluster.cpp:202
#define NULL
Definition: host.h:144
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
double FLOAT64
Definition: host.h:112
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2286
int AlphaMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2407
#define MAXDEGREESOFFREEDOM
Definition: cluster.cpp:225
#define MINALPHA
FLOAT64 ChiSquared
Definition: cluster.cpp:188
FLOAT64 Solve(SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
Definition: cluster.cpp:2457
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:413
#define first_node(l)
Definition: oldlist.h:139
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:64
FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x)
Definition: cluster.cpp:2521
void ComputePrototypes ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

ComputePrototypes ******************************************************* Parameters: Clusterer data structure holding cluster tree Config parameters used to control prototype generation Operation: This routine decides which clusters in the cluster tree should be represented by prototypes, forms a list of these prototypes, and places the list in the Clusterer data structure. Return: None Exceptions: None History: 5/30/89, DSJ, Created.

Definition at line 927 of file cluster.cpp.

927  {
928  LIST ClusterStack = NIL_LIST;
929  CLUSTER *Cluster;
930  PROTOTYPE *Prototype;
931 
932  // use a stack to keep track of clusters waiting to be processed
933  // initially the only cluster on the stack is the root cluster
934  if (Clusterer->Root != NULL)
935  ClusterStack = push (NIL_LIST, Clusterer->Root);
936 
937  // loop until we have analyzed all clusters which are potential prototypes
938  while (ClusterStack != NIL_LIST) {
939  // remove the next cluster to be analyzed from the stack
940  // try to make a prototype from the cluster
941  // if successful, put it on the proto list, else split the cluster
942  Cluster = (CLUSTER *) first_node (ClusterStack);
943  ClusterStack = pop (ClusterStack);
944  Prototype = MakePrototype(Clusterer, Config, Cluster);
945  if (Prototype != NULL) {
946  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
947  }
948  else {
949  ClusterStack = push (ClusterStack, Cluster->Right);
950  ClusterStack = push (ClusterStack, Cluster->Left);
951  }
952  }
953 } // ComputePrototypes
LIST pop(LIST list)
Definition: oldlist.cpp:305
Definition: cluster.h:32
#define NIL_LIST
Definition: oldlist.h:126
#define NULL
Definition: host.h:144
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
CLUSTER * Root
Definition: cluster.h:91
struct sample * Right
Definition: cluster.h:37
LIST ProtoList
Definition: cluster.h:92
struct sample * Left
Definition: cluster.h:36
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
Definition: cluster.cpp:974
#define first_node(l)
Definition: oldlist.h:139
STATISTICS * ComputeStatistics ( inT16  N,
PARAM_DESC  ParamDesc[],
CLUSTER Cluster 
)

ComputeStatistics ********************************************************* Parameters: N number of dimensions ParamDesc array of dimension descriptions Cluster cluster whose stats are to be computed Operation: This routine searches the cluster tree for all leaf nodes which are samples in the specified cluster. It computes a full covariance matrix for these samples as well as keeping track of the ranges (min and max) for each dimension. A special data structure is allocated to return this information to the caller. An incremental algorithm for computing statistics is not used because it will not work with circular dimensions. Return: Pointer to new data structure containing statistics Exceptions: None History: 6/2/89, DSJ, Created.

Definition at line 1422 of file cluster.cpp.

1422  {
1423  STATISTICS *Statistics;
1424  int i, j;
1425  FLOAT32 *CoVariance;
1426  FLOAT32 *Distance;
1427  LIST SearchState;
1428  SAMPLE *Sample;
1429  uinT32 SampleCountAdjustedForBias;
1430 
1431  // allocate memory to hold the statistics results
1432  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1433  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1434  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1435  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1436 
1437  // allocate temporary memory to hold the sample to mean distances
1438  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1439 
1440  // initialize the statistics
1441  Statistics->AvgVariance = 1.0;
1442  CoVariance = Statistics->CoVariance;
1443  for (i = 0; i < N; i++) {
1444  Statistics->Min[i] = 0.0;
1445  Statistics->Max[i] = 0.0;
1446  for (j = 0; j < N; j++, CoVariance++)
1447  *CoVariance = 0;
1448  }
1449  // find each sample in the cluster and merge it into the statistics
1450  InitSampleSearch(SearchState, Cluster);
1451  while ((Sample = NextSample (&SearchState)) != NULL) {
1452  for (i = 0; i < N; i++) {
1453  Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1454  if (ParamDesc[i].Circular) {
1455  if (Distance[i] > ParamDesc[i].HalfRange)
1456  Distance[i] -= ParamDesc[i].Range;
1457  if (Distance[i] < -ParamDesc[i].HalfRange)
1458  Distance[i] += ParamDesc[i].Range;
1459  }
1460  if (Distance[i] < Statistics->Min[i])
1461  Statistics->Min[i] = Distance[i];
1462  if (Distance[i] > Statistics->Max[i])
1463  Statistics->Max[i] = Distance[i];
1464  }
1465  CoVariance = Statistics->CoVariance;
1466  for (i = 0; i < N; i++)
1467  for (j = 0; j < N; j++, CoVariance++)
1468  *CoVariance += Distance[i] * Distance[j];
1469  }
1470  // normalize the variances by the total number of samples
1471  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1472  // also compute the geometic mean of the diagonal variances
1473  // ensure that clusters with only 1 sample are handled correctly
1474  if (Cluster->SampleCount > 1)
1475  SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1476  else
1477  SampleCountAdjustedForBias = 1;
1478  CoVariance = Statistics->CoVariance;
1479  for (i = 0; i < N; i++)
1480  for (j = 0; j < N; j++, CoVariance++) {
1481  *CoVariance /= SampleCountAdjustedForBias;
1482  if (j == i) {
1483  if (*CoVariance < MINVARIANCE)
1484  *CoVariance = MINVARIANCE;
1485  Statistics->AvgVariance *= *CoVariance;
1486  }
1487  }
1488  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1489  1.0 / N);
1490 
1491  // release temporary memory and return
1492  memfree(Distance);
1493  return (Statistics);
1494 } // ComputeStatistics
void memfree(void *element)
Definition: freelist.cpp:30
unsigned SampleCount
Definition: cluster.h:35
Definition: cluster.h:32
#define NULL
Definition: host.h:144
FLOAT32 Range
Definition: ocrfeatures.h:50
float FLOAT32
Definition: host.h:111
FLOAT32 * Min
Definition: cluster.cpp:170
#define MINVARIANCE
Definition: cluster.cpp:141
FLOAT32 * Max
Definition: cluster.cpp:171
#define InitSampleSearch(S, C)
Definition: cluster.h:105
FLOAT32 AvgVariance
Definition: cluster.cpp:168
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
FLOAT32 * CoVariance
Definition: cluster.cpp:169
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:614
unsigned int uinT32
Definition: host.h:103
void CreateClusterTree ( CLUSTERER Clusterer)

CreateClusterTree ******************************************************* Parameters: Clusterer data structure holdings samples to be clustered Operation: This routine performs a bottoms-up clustering on the samples held in the kd-tree of the Clusterer data structure. The result is a cluster tree. Each node in the tree represents a cluster which conceptually contains a subset of the samples. More precisely, the cluster contains all of the samples which are contained in its two sub-clusters. The leaves of the tree are the individual samples themselves; they have no sub-clusters. The root node of the tree conceptually contains all of the samples. Return: None (the Clusterer data structure is changed) Exceptions: None History: 5/29/89, DSJ, Created.

Definition at line 694 of file cluster.cpp.

694  {
695  ClusteringContext context;
696  HEAPENTRY HeapEntry;
697  TEMPCLUSTER *PotentialCluster;
698 
699  // each sample and its nearest neighbor form a "potential" cluster
700  // save these in a heap with the "best" potential clusters on top
701  context.tree = Clusterer->KDTree;
702  context.candidates = (TEMPCLUSTER *)
703  Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
704  context.next = 0;
705  context.heap = MakeHeap(Clusterer->NumberOfSamples);
706  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
707 
708  // form potential clusters into actual clusters - always do "best" first
709  while (GetTopOfHeap(context.heap, &HeapEntry) != EMPTY) {
710  PotentialCluster = (TEMPCLUSTER *)HeapEntry.Data;
711 
712  // if main cluster of potential cluster is already in another cluster
713  // then we don't need to worry about it
714  if (PotentialCluster->Cluster->Clustered) {
715  continue;
716  }
717 
718  // if main cluster is not yet clustered, but its nearest neighbor is
719  // then we must find a new nearest neighbor
720  else if (PotentialCluster->Neighbor->Clustered) {
721  PotentialCluster->Neighbor =
722  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
723  &HeapEntry.Key);
724  if (PotentialCluster->Neighbor != NULL) {
725  HeapStore(context.heap, &HeapEntry);
726  }
727  }
728 
729  // if neither cluster is already clustered, form permanent cluster
730  else {
731  PotentialCluster->Cluster =
732  MakeNewCluster(Clusterer, PotentialCluster);
733  PotentialCluster->Neighbor =
734  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
735  &HeapEntry.Key);
736  if (PotentialCluster->Neighbor != NULL) {
737  HeapStore(context.heap, &HeapEntry);
738  }
739  }
740  }
741 
742  // the root node in the cluster tree is now the only node in the kd-tree
743  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
744 
745  // free up the memory used by the K-D tree, heap, and temp clusters
746  FreeKDTree(context.tree);
747  Clusterer->KDTree = NULL;
748  FreeHeap(context.heap);
749  memfree(context.candidates);
750 } // CreateClusterTree
#define RootOf(T)
Definition: kdtree.h:58
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
Definition: cluster.cpp:837
void memfree(void *element)
Definition: freelist.cpp:30
Definition: cluster.h:32
#define NULL
Definition: host.h:144
HEAP * MakeHeap(int Size)
Definition: oldheap.cpp:49
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:798
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
Definition: cluster.cpp:764
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:346
CLUSTER * Root
Definition: cluster.h:91
#define FreeHeap(H)
Definition: oldheap.h:46
int GetTopOfHeap(HEAP *Heap, HEAPENTRY *Entry)
Definition: oldheap.cpp:273
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
void HeapStore(HEAP *Heap, HEAPENTRY *Entry)
Definition: oldheap.cpp:234
void * Data
Definition: oldheap.h:34
inT32 NumberOfSamples
Definition: cluster.h:89
TEMPCLUSTER * candidates
Definition: cluster.cpp:194
void(* void_proc)(...)
Definition: cutil.h:66
CLUSTER * Cluster
Definition: cluster.cpp:163
void KDWalk(KDTREE *Tree, void_proc action, void *context)
Definition: kdtree.cpp:339
unsigned Clustered
Definition: cluster.h:33
FLOAT32 Key
Definition: oldheap.h:33
#define EMPTY
Definition: oldheap.h:29
KDTREE * KDTree
Definition: cluster.h:90
CLUSTER * Neighbor
Definition: cluster.cpp:164
uinT16 DegreesOfFreedom ( DISTRIBUTION  Distribution,
uinT16  HistogramBuckets 
)

Definition at line 2286 of file cluster.cpp.

2286  {
2287 /*
2288  ** Parameters:
2289  ** Distribution distribution being tested for
2290  ** HistogramBuckets number of buckets in chi-square test
2291  ** Operation:
2292  ** This routine computes the degrees of freedom that should
2293  ** be used in a chi-squared test with the specified number of
2294  ** histogram buckets. The result is always rounded up to
2295  ** the next even number so that the value of chi-squared can be
2296  ** computed more easily. This will cause the value of
2297  ** chi-squared to be higher than the optimum value, resulting
2298  ** in the chi-square test being more lenient than optimum.
2299  ** Return: The number of degrees of freedom for a chi-square test
2300  ** Exceptions: none
2301  ** History: Thu Aug 3 14:04:18 1989, DSJ, Created.
2302  */
2303  static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2304 
2305  uinT16 AdjustedNumBuckets;
2306 
2307  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2308  if (Odd (AdjustedNumBuckets))
2309  AdjustedNumBuckets++;
2310  return (AdjustedNumBuckets);
2311 
2312 } // DegreesOfFreedom
#define Odd(N)
Definition: cluster.cpp:202
unsigned short uinT16
Definition: host.h:101
unsigned char uinT8
Definition: host.h:99
BOOL8 DistributionOK ( BUCKETS Buckets)

Definition at line 2187 of file cluster.cpp.

2187  {
2188 /*
2189  ** Parameters:
2190  ** Buckets histogram data to perform chi-square test on
2191  ** Operation:
2192  ** This routine performs a chi-square goodness of fit test
2193  ** on the histogram data in the Buckets data structure. TRUE
2194  ** is returned if the histogram matches the probability
2195  ** distribution which was specified when the Buckets
2196  ** structure was originally created. Otherwise FALSE is
2197  ** returned.
2198  ** Return:
2199  ** TRUE if samples match distribution, FALSE otherwise
2200  ** Exceptions:
2201  ** None
2202  ** History:
2203  ** 6/5/89, DSJ, Created.
2204  */
2205  FLOAT32 FrequencyDifference;
2206  FLOAT32 TotalDifference;
2207  int i;
2208 
2209  // compute how well the histogram matches the expected histogram
2210  TotalDifference = 0.0;
2211  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2212  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2213  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2214  Buckets->ExpectedCount[i];
2215  }
2216 
2217  // test to see if the difference is more than expected
2218  if (TotalDifference > Buckets->ChiSquared)
2219  return FALSE;
2220  else
2221  return TRUE;
2222 } // DistributionOK
uinT32 * Count
Definition: cluster.cpp:181
FLOAT64 ChiSquared
Definition: cluster.cpp:178
#define FALSE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
uinT16 NumberOfBuckets
Definition: cluster.cpp:179
FLOAT32 * ExpectedCount
Definition: cluster.cpp:182
#define TRUE
Definition: capi.h:27
void FillBuckets ( BUCKETS Buckets,
CLUSTER Cluster,
uinT16  Dim,
PARAM_DESC ParamDesc,
FLOAT32  Mean,
FLOAT32  StdDev 
)

Definition at line 2015 of file cluster.cpp.

2020  {
2021 /*
2022  ** Parameters:
2023  ** Buckets histogram buckets to count samples
2024  ** Cluster cluster whose samples are being analyzed
2025  ** Dim dimension of samples which is being analyzed
2026  ** ParamDesc description of the dimension
2027  ** Mean "mean" of the distribution
2028  ** StdDev "standard deviation" of the distribution
2029  ** Operation:
2030  ** This routine counts the number of cluster samples which
2031  ** fall within the various histogram buckets in Buckets. Only
2032  ** one dimension of each sample is examined. The exact meaning
2033  ** of the Mean and StdDev parameters depends on the
2034  ** distribution which is being analyzed (this info is in the
2035  ** Buckets data structure). For normal distributions, Mean
2036  ** and StdDev have the expected meanings. For uniform and
2037  ** random distributions the Mean is the center point of the
2038  ** range and the StdDev is 1/2 the range. A dimension with
2039  ** zero standard deviation cannot be statistically analyzed.
2040  ** In this case, a pseudo-analysis is used.
2041  ** Return:
2042  ** None (the Buckets data structure is filled in)
2043  ** Exceptions:
2044  ** None
2045  ** History:
2046  ** 6/5/89, DSJ, Created.
2047  */
2048  uinT16 BucketID;
2049  int i;
2050  LIST SearchState;
2051  SAMPLE *Sample;
2052 
2053  // initialize the histogram bucket counts to 0
2054  for (i = 0; i < Buckets->NumberOfBuckets; i++)
2055  Buckets->Count[i] = 0;
2056 
2057  if (StdDev == 0.0) {
2058  /* if the standard deviation is zero, then we can't statistically
2059  analyze the cluster. Use a pseudo-analysis: samples exactly on
2060  the mean are distributed evenly across all buckets. Samples greater
2061  than the mean are placed in the last bucket; samples less than the
2062  mean are placed in the first bucket. */
2063 
2064  InitSampleSearch(SearchState, Cluster);
2065  i = 0;
2066  while ((Sample = NextSample (&SearchState)) != NULL) {
2067  if (Sample->Mean[Dim] > Mean)
2068  BucketID = Buckets->NumberOfBuckets - 1;
2069  else if (Sample->Mean[Dim] < Mean)
2070  BucketID = 0;
2071  else
2072  BucketID = i;
2073  Buckets->Count[BucketID] += 1;
2074  i++;
2075  if (i >= Buckets->NumberOfBuckets)
2076  i = 0;
2077  }
2078  }
2079  else {
2080  // search for all samples in the cluster and add to histogram buckets
2081  InitSampleSearch(SearchState, Cluster);
2082  while ((Sample = NextSample (&SearchState)) != NULL) {
2083  switch (Buckets->Distribution) {
2084  case normal:
2085  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2086  Mean, StdDev);
2087  break;
2088  case D_random:
2089  case uniform:
2090  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2091  Mean, StdDev);
2092  break;
2093  default:
2094  BucketID = 0;
2095  }
2096  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2097  }
2098  }
2099 } // FillBuckets
Definition: cluster.h:32
uinT32 * Count
Definition: cluster.cpp:181
#define NULL
Definition: host.h:144
uinT16 NumberOfBuckets
Definition: cluster.cpp:179
#define InitSampleSearch(S, C)
Definition: cluster.h:105
FLOAT32 Mean[1]
Definition: cluster.h:39
uinT16 NormalBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2103
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:180
unsigned short uinT16
Definition: host.h:101
uinT16 UniformBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2145
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:639
DISTRIBUTION Distribution
Definition: cluster.cpp:175
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:614
Definition: cluster.h:59
CLUSTER * FindNearestNeighbor ( KDTREE Tree,
CLUSTER Cluster,
FLOAT32 Distance 
)

FindNearestNeighbor ********************************************************* Parameters: Tree kd-tree to search in for nearest neighbor Cluster cluster whose nearest neighbor is to be found Distance ptr to variable to report distance found Operation: This routine searches the specified kd-tree for the nearest neighbor of the specified cluster. It actually uses the kd routines to find the 2 nearest neighbors since one of them will be the original cluster. A pointer to the nearest neighbor is returned, if it can be found, otherwise NULL is returned. The distance between the 2 nodes is placed in the specified variable. Return: Pointer to the nearest neighbor of Cluster, or NULL Exceptions: none History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 798 of file cluster.cpp.

801 {
802  CLUSTER *Neighbor[MAXNEIGHBORS];
803  FLOAT32 Dist[MAXNEIGHBORS];
804  int NumberOfNeighbors;
805  inT32 i;
806  CLUSTER *BestNeighbor;
807 
808  // find the 2 nearest neighbors of the cluster
810  &NumberOfNeighbors, (void **)Neighbor, Dist);
811 
812  // search for the nearest neighbor that is not the cluster itself
813  *Distance = MAXDISTANCE;
814  BestNeighbor = NULL;
815  for (i = 0; i < NumberOfNeighbors; i++) {
816  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
817  *Distance = Dist[i];
818  BestNeighbor = Neighbor[i];
819  }
820  }
821  return BestNeighbor;
822 } // FindNearestNeighbor
#define MAXNEIGHBORS
Definition: cluster.h:32
void KDNearestNeighborSearch(KDTREE *Tree, FLOAT32 Query[], int QuerySize, FLOAT32 MaxDistance, int *NumberOfResults, void **NBuffer, FLOAT32 DBuffer[])
Definition: kdtree.cpp:307
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
float FLOAT32
Definition: host.h:111
FLOAT32 Mean[1]
Definition: cluster.h:39
#define MAXDISTANCE
void FreeBuckets ( BUCKETS Buckets)

Definition at line 2248 of file cluster.cpp.

2248  {
2249 /*
2250  ** Parameters:
2251  ** buckets pointer to data structure to be freed
2252  ** Operation:
2253  ** This routine properly frees the memory used by a BUCKETS.
2254  */
2255  Efree(buckets->Count);
2256  Efree(buckets->ExpectedCount);
2257  Efree(buckets);
2258 } // FreeBuckets
void Efree(void *ptr)
Definition: emalloc.cpp:85
void FreeCluster ( CLUSTER Cluster)

Definition at line 2262 of file cluster.cpp.

2262  {
2263 /*
2264  ** Parameters:
2265  ** Cluster pointer to cluster to be freed
2266  ** Operation:
2267  ** This routine frees the memory consumed by the specified
2268  ** cluster and all of its subclusters. This is done by
2269  ** recursive calls to FreeCluster().
2270  ** Return:
2271  ** None
2272  ** Exceptions:
2273  ** None
2274  ** History:
2275  ** 6/6/89, DSJ, Created.
2276  */
2277  if (Cluster != NULL) {
2278  FreeCluster (Cluster->Left);
2279  FreeCluster (Cluster->Right);
2280  memfree(Cluster);
2281  }
2282 } // FreeCluster
void memfree(void *element)
Definition: freelist.cpp:30
#define NULL
Definition: host.h:144
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2262
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
void FreeClusterer ( CLUSTERER Clusterer)

FreeClusterer ************************************************************* Parameters: Clusterer pointer to data structure to be freed Operation: This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to NULL to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid. Return: None Exceptions: None History: 6/6/89, DSJ, Created.

Definition at line 532 of file cluster.cpp.

532  {
533  if (Clusterer != NULL) {
534  memfree (Clusterer->ParamDesc);
535  if (Clusterer->KDTree != NULL)
536  FreeKDTree (Clusterer->KDTree);
537  if (Clusterer->Root != NULL)
538  FreeCluster (Clusterer->Root);
539  // Free up all used buckets structures.
540  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
541  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
542  if (Clusterer->bucket_cache[d][c] != NULL)
543  FreeBuckets(Clusterer->bucket_cache[d][c]);
544  }
545 
546  memfree(Clusterer);
547  }
548 } // FreeClusterer
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2248
void memfree(void *element)
Definition: freelist.cpp:30
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define NULL
Definition: host.h:144
#define MAXBUCKETS
Definition: cluster.h:27
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:346
CLUSTER * Root
Definition: cluster.h:91
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2262
#define MINBUCKETS
Definition: cluster.h:26
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
void FreeProtoList ( LIST ProtoList)

FreeProtoList ************************************************************ Parameters: ProtoList pointer to list of prototypes to be freed Operation: This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed. Return: None Exceptions: None History: 6/6/89, DSJ, Created.

Definition at line 560 of file cluster.cpp.

560  {
561  destroy_nodes(*ProtoList, FreePrototype);
562 } // FreeProtoList
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
void FreePrototype(void *arg)
Definition: cluster.cpp:575
void FreePrototype ( void *  arg)

FreePrototype ************************************************************ Parameters: Prototype prototype data structure to be deallocated Operation: This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine. Return: None Exceptions: None History: 5/30/89, DSJ, Created.

Definition at line 575 of file cluster.cpp.

575  { //PROTOTYPE *Prototype)
576  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
577 
578  // unmark the corresponding cluster (if there is one
579  if (Prototype->Cluster != NULL)
580  Prototype->Cluster->Prototype = FALSE;
581 
582  // deallocate the prototype statistics and then the prototype itself
583  if (Prototype->Distrib != NULL)
584  memfree (Prototype->Distrib);
585  if (Prototype->Mean != NULL)
586  memfree (Prototype->Mean);
587  if (Prototype->Style != spherical) {
588  if (Prototype->Variance.Elliptical != NULL)
589  memfree (Prototype->Variance.Elliptical);
590  if (Prototype->Magnitude.Elliptical != NULL)
591  memfree (Prototype->Magnitude.Elliptical);
592  if (Prototype->Weight.Elliptical != NULL)
593  memfree (Prototype->Weight.Elliptical);
594  }
595  memfree(Prototype);
596 } // FreePrototype
unsigned Prototype
Definition: cluster.h:34
DISTRIBUTION * Distrib
Definition: cluster.h:77
void memfree(void *element)
Definition: freelist.cpp:30
unsigned Style
Definition: cluster.h:74
FLOAT32 * Elliptical
Definition: cluster.h:64
#define NULL
Definition: host.h:144
FLOATUNION Variance
Definition: cluster.h:81
#define FALSE
Definition: capi.h:28
FLOATUNION Weight
Definition: cluster.h:83
CLUSTER * Cluster
Definition: cluster.h:76
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Mean
Definition: cluster.h:78
void FreeStatistics ( STATISTICS Statistics)

Definition at line 2226 of file cluster.cpp.

2226  {
2227 /*
2228  ** Parameters:
2229  ** Statistics pointer to data structure to be freed
2230  ** Operation:
2231  ** This routine frees the memory used by the statistics
2232  ** data structure.
2233  ** Return:
2234  ** None
2235  ** Exceptions:
2236  ** None
2237  ** History:
2238  ** 6/5/89, DSJ, Created.
2239  */
2240  memfree (Statistics->CoVariance);
2241  memfree (Statistics->Min);
2242  memfree (Statistics->Max);
2243  memfree(Statistics);
2244 } // FreeStatistics
void memfree(void *element)
Definition: freelist.cpp:30
FLOAT32 * Min
Definition: cluster.cpp:170
FLOAT32 * Max
Definition: cluster.cpp:171
FLOAT32 * CoVariance
Definition: cluster.cpp:169
BUCKETS * GetBuckets ( CLUSTERER clusterer,
DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

GetBuckets ************************************************************** Parameters: Clusterer which keeps a bucket_cache for us. Distribution type of probability distribution to test for SampleCount number of samples that are available Confidence probability of a Type I error Operation: This routine returns a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The routine keeps a list of bucket data structures which have already been created so that it minimizes the computation time needed to create a new bucket. Return: Bucket data structure Exceptions: none History: Thu Aug 3 12:58:10 1989, DSJ, Created.

Definition at line 1705 of file cluster.cpp.

1708  {
1709  // Get an old bucket structure with the same number of buckets.
1710  uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1711  BUCKETS *Buckets =
1712  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
1713 
1714  // If a matching bucket structure is not found, make one and save it.
1715  if (Buckets == NULL) {
1716  Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
1717  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
1718  Buckets;
1719  } else {
1720  // Just adjust the existing buckets.
1721  if (SampleCount != Buckets->SampleCount)
1722  AdjustBuckets(Buckets, SampleCount);
1723  if (Confidence != Buckets->Confidence) {
1724  Buckets->Confidence = Confidence;
1725  Buckets->ChiSquared = ComputeChiSquared(
1726  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
1727  Confidence);
1728  }
1729  InitBuckets(Buckets);
1730  }
1731  return Buckets;
1732 } // GetBuckets
uinT32 SampleCount
Definition: cluster.cpp:176
void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount)
Definition: cluster.cpp:2357
FLOAT64 ChiSquared
Definition: cluster.cpp:178
#define NULL
Definition: host.h:144
uinT16 NumberOfBuckets
Definition: cluster.cpp:179
FLOAT64 Confidence
Definition: cluster.cpp:177
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2286
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1880
#define MINBUCKETS
Definition: cluster.h:26
unsigned short uinT16
Definition: host.h:101
void InitBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2386
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1755
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1839
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
BOOL8 Independent ( PARAM_DESC  ParamDesc[],
inT16  N,
FLOAT32 CoVariance,
FLOAT32  Independence 
)

Independent *************************************************************** Parameters: ParamDesc descriptions of each feature space dimension N number of dimensions CoVariance ptr to a covariance matrix Independence max off-diagonal correlation coefficient Operation: This routine returns TRUE if the specified covariance matrix indicates that all N dimensions are independent of one another. One dimension is judged to be independent of another when the magnitude of the corresponding correlation coefficient is less than the specified Independence factor. The correlation coefficient is calculated as: (see Duda and Hart, pg. 247) coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) The covariance matrix is assumed to be symmetric (which should always be true). Return: TRUE if dimensions are independent, FALSE otherwise Exceptions: None History: 6/4/89, DSJ, Created.

Definition at line 1656 of file cluster.cpp.

1657  {
1658  int i, j;
1659  FLOAT32 *VARii; // points to ith on-diagonal element
1660  FLOAT32 *VARjj; // points to jth on-diagonal element
1661  FLOAT32 CorrelationCoeff;
1662 
1663  VARii = CoVariance;
1664  for (i = 0; i < N; i++, VARii += N + 1) {
1665  if (ParamDesc[i].NonEssential)
1666  continue;
1667 
1668  VARjj = VARii + N + 1;
1669  CoVariance = VARii + 1;
1670  for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1671  if (ParamDesc[j].NonEssential)
1672  continue;
1673 
1674  if ((*VARii == 0.0) || (*VARjj == 0.0))
1675  CorrelationCoeff = 0.0;
1676  else
1677  CorrelationCoeff =
1678  sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1679  if (CorrelationCoeff > Independence)
1680  return (FALSE);
1681  }
1682  }
1683  return (TRUE);
1684 } // Independent
#define FALSE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
#define TRUE
Definition: capi.h:27
void InitBuckets ( BUCKETS Buckets)

Definition at line 2386 of file cluster.cpp.

2386  {
2387 /*
2388  ** Parameters:
2389  ** Buckets histogram data structure to init
2390  ** Operation:
2391  ** This routine sets the bucket counts in the specified histogram
2392  ** to zero.
2393  ** Return: none
2394  ** Exceptions: none
2395  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2396  */
2397  int i;
2398 
2399  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2400  Buckets->Count[i] = 0;
2401  }
2402 
2403 } // InitBuckets
uinT32 * Count
Definition: cluster.cpp:181
uinT16 NumberOfBuckets
Definition: cluster.cpp:179
FLOAT64 Integral ( FLOAT64  f1,
FLOAT64  f2,
FLOAT64  Dx 
)

Definition at line 1994 of file cluster.cpp.

1994  {
1995 /*
1996  ** Parameters:
1997  ** f1 value of function at x1
1998  ** f2 value of function at x2
1999  ** Dx x2 - x1 (should always be positive)
2000  ** Operation:
2001  ** This routine computes a trapezoidal approximation to the
2002  ** integral of a function over a small delta in x.
2003  ** Return:
2004  ** Approximation of the integral of the function from x1 to x2.
2005  ** Exceptions:
2006  ** None
2007  ** History:
2008  ** 6/5/89, DSJ, Created.
2009  */
2010  return (f1 + f2) * Dx / 2.0;
2011 } // Integral
double InvertMatrix ( const float *  input,
int  size,
float *  inv 
)

Definition at line 2648 of file cluster.cpp.

2648  {
2649  // Allocate memory for the 2D arrays.
2650  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
2651  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
2652  GENERIC_2D_ARRAY<double> L(size, size, 0.0);
2653 
2654  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2655  int row;
2656  int col;
2657  for (row = 0; row < size; row++) {
2658  for (col = 0; col < size; col++) {
2659  U[row][col] = input[row*size + col];
2660  L[row][col] = row == col ? 1.0 : 0.0;
2661  U_inv[row][col] = 0.0;
2662  }
2663  }
2664 
2665  // Compute forward matrix by inversion by LU decomposition of input.
2666  for (col = 0; col < size; ++col) {
2667  // Find best pivot
2668  int best_row = 0;
2669  double best_pivot = -1.0;
2670  for (row = col; row < size; ++row) {
2671  if (Abs(U[row][col]) > best_pivot) {
2672  best_pivot = Abs(U[row][col]);
2673  best_row = row;
2674  }
2675  }
2676  // Exchange pivot rows.
2677  if (best_row != col) {
2678  for (int k = 0; k < size; ++k) {
2679  double tmp = U[best_row][k];
2680  U[best_row][k] = U[col][k];
2681  U[col][k] = tmp;
2682  tmp = L[best_row][k];
2683  L[best_row][k] = L[col][k];
2684  L[col][k] = tmp;
2685  }
2686  }
2687  // Now do the pivot itself.
2688  for (row = col + 1; row < size; ++row) {
2689  double ratio = -U[row][col] / U[col][col];
2690  for (int j = col; j < size; ++j) {
2691  U[row][j] += U[col][j] * ratio;
2692  }
2693  for (int k = 0; k < size; ++k) {
2694  L[row][k] += L[col][k] * ratio;
2695  }
2696  }
2697  }
2698  // Next invert U.
2699  for (col = 0; col < size; ++col) {
2700  U_inv[col][col] = 1.0 / U[col][col];
2701  for (row = col - 1; row >= 0; --row) {
2702  double total = 0.0;
2703  for (int k = col; k > row; --k) {
2704  total += U[row][k] * U_inv[k][col];
2705  }
2706  U_inv[row][col] = -total / U[row][row];
2707  }
2708  }
2709  // Now the answer is U_inv.L.
2710  for (row = 0; row < size; row++) {
2711  for (col = 0; col < size; col++) {
2712  double sum = 0.0;
2713  for (int k = row; k < size; ++k) {
2714  sum += U_inv[row][k] * L[k][col];
2715  }
2716  inv[row*size + col] = sum;
2717  }
2718  }
2719  // Check matrix product.
2720  double error_sum = 0.0;
2721  for (row = 0; row < size; row++) {
2722  for (col = 0; col < size; col++) {
2723  double sum = 0.0;
2724  for (int k = 0; k < size; ++k) {
2725  sum += input[row*size + k] * inv[k *size + col];
2726  }
2727  if (row != col) {
2728  error_sum += Abs(sum);
2729  }
2730  }
2731  }
2732  return error_sum;
2733 }
#define Abs(N)
Definition: cluster.cpp:204
int ListEntryMatch ( void *  arg1,
void *  arg2 
)

Definition at line 2339 of file cluster.cpp.

2340  { //Key
2341 /*
2342  ** Parameters: none
2343  ** Operation:
2344  ** This routine is used to search a list for a list node
2345  ** whose contents match Key. It is called by the list
2346  ** delete_d routine.
2347  ** Return: TRUE if ListNode matches Key
2348  ** Exceptions: none
2349  ** History: Thu Aug 3 14:23:58 1989, DSJ, Created.
2350  */
2351  return (arg1 == arg2);
2352 
2353 } // ListEntryMatch
BUCKETS * MakeBuckets ( DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

Makebuckets ************************************************************* Parameters: Distribution type of probability distribution to test for SampleCount number of samples that are available Confidence probability of a Type I error Operation: This routine creates a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The buckets are allocated in such a way that the expected frequency of samples in each bucket is approximately the same. In order to make this possible, a mapping table is computed which maps "normalized" samples into the appropriate bucket. Return: Pointer to new histogram data structure Exceptions: None History: 6/4/89, DSJ, Created.

Definition at line 1755 of file cluster.cpp.

1757  {
1758  const DENSITYFUNC DensityFunction[] =
1759  { NormalDensity, UniformDensity, UniformDensity };
1760  int i, j;
1761  BUCKETS *Buckets;
1762  FLOAT64 BucketProbability;
1763  FLOAT64 NextBucketBoundary;
1764  FLOAT64 Probability;
1765  FLOAT64 ProbabilityDelta;
1766  FLOAT64 LastProbDensity;
1767  FLOAT64 ProbDensity;
1768  uinT16 CurrentBucket;
1769  BOOL8 Symmetrical;
1770 
1771  // allocate memory needed for data structure
1772  Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
1773  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1774  Buckets->SampleCount = SampleCount;
1775  Buckets->Confidence = Confidence;
1776  Buckets->Count = reinterpret_cast<uinT32*>(
1777  Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
1778  Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
1779  Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
1780 
1781  // initialize simple fields
1782  Buckets->Distribution = Distribution;
1783  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1784  Buckets->Count[i] = 0;
1785  Buckets->ExpectedCount[i] = 0.0;
1786  }
1787 
1788  // all currently defined distributions are symmetrical
1789  Symmetrical = TRUE;
1790  Buckets->ChiSquared = ComputeChiSquared(
1791  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
1792 
1793  if (Symmetrical) {
1794  // allocate buckets so that all have approx. equal probability
1795  BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1796 
1797  // distribution is symmetric so fill in upper half then copy
1798  CurrentBucket = Buckets->NumberOfBuckets / 2;
1799  if (Odd (Buckets->NumberOfBuckets))
1800  NextBucketBoundary = BucketProbability / 2;
1801  else
1802  NextBucketBoundary = BucketProbability;
1803 
1804  Probability = 0.0;
1805  LastProbDensity =
1806  (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1807  for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1808  ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1809  ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1810  Probability += ProbabilityDelta;
1811  if (Probability > NextBucketBoundary) {
1812  if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1813  CurrentBucket++;
1814  NextBucketBoundary += BucketProbability;
1815  }
1816  Buckets->Bucket[i] = CurrentBucket;
1817  Buckets->ExpectedCount[CurrentBucket] +=
1818  (FLOAT32) (ProbabilityDelta * SampleCount);
1819  LastProbDensity = ProbDensity;
1820  }
1821  // place any leftover probability into the last bucket
1822  Buckets->ExpectedCount[CurrentBucket] +=
1823  (FLOAT32) ((0.5 - Probability) * SampleCount);
1824 
1825  // copy upper half of distribution to lower half
1826  for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1827  Buckets->Bucket[i] =
1828  Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
1829 
1830  // copy upper half of expected counts to lower half
1831  for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1832  Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1833  }
1834  return Buckets;
1835 } // MakeBuckets
#define Odd(N)
Definition: cluster.cpp:202
uinT32 SampleCount
Definition: cluster.cpp:176
unsigned char BOOL8
Definition: host.h:113
uinT32 * Count
Definition: cluster.cpp:181
FLOAT64 ChiSquared
Definition: cluster.cpp:178
#define Mirror(N, R)
Definition: cluster.cpp:203
float FLOAT32
Definition: host.h:111
FLOAT64 UniformDensity(inT32 x)
Definition: cluster.cpp:1969
double FLOAT64
Definition: host.h:112
uinT16 NumberOfBuckets
Definition: cluster.cpp:179
FLOAT64 NormalDensity(inT32 x)
Definition: cluster.cpp:1940
FLOAT32 * ExpectedCount
Definition: cluster.cpp:182
FLOAT64 Confidence
Definition: cluster.cpp:177
FLOAT64(* DENSITYFUNC)(inT32)
Definition: cluster.cpp:199
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2286
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1880
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:180
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
unsigned short uinT16
Definition: host.h:101
FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
Definition: cluster.cpp:1994
DISTRIBUTION Distribution
Definition: cluster.cpp:175
#define BUCKETTABLESIZE
Definition: cluster.cpp:159
unsigned int uinT32
Definition: host.h:103
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1839
#define TRUE
Definition: capi.h:27
CLUSTERER* MakeClusterer ( inT16  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

MakeClusterer ********************************************************** Parameters: SampleSize number of dimensions in feature space ParamDesc description of each dimension Operation: This routine creates a new clusterer data structure, initializes it, and returns a pointer to it. Return: pointer to the new clusterer data structure Exceptions: None History: 5/29/89, DSJ, Created.

Definition at line 395 of file cluster.cpp.

395  {
396  CLUSTERER *Clusterer;
397  int i;
398 
399  // allocate main clusterer data structure and init simple fields
400  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
401  Clusterer->SampleSize = SampleSize;
402  Clusterer->NumberOfSamples = 0;
403  Clusterer->NumChar = 0;
404 
405  // init fields which will not be used initially
406  Clusterer->Root = NULL;
407  Clusterer->ProtoList = NIL_LIST;
408 
409  // maintain a copy of param descriptors in the clusterer data structure
410  Clusterer->ParamDesc =
411  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
412  for (i = 0; i < SampleSize; i++) {
413  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
414  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
415  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
416  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
417  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
418  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
419  Clusterer->ParamDesc[i].MidRange =
420  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
421  }
422 
423  // allocate a kd tree to hold the samples
424  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
425 
426  // Initialize cache of histogram buckets to minimize recomputing them.
427  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
428  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
429  Clusterer->bucket_cache[d][c] = NULL;
430  }
431 
432  return Clusterer;
433 } // MakeClusterer
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 MidRange
Definition: ocrfeatures.h:52
#define NIL_LIST
Definition: oldlist.h:126
inT32 NumChar
Definition: cluster.h:93
inT8 Circular
Definition: ocrfeatures.h:46
FLOAT32 HalfRange
Definition: ocrfeatures.h:51
#define NULL
Definition: host.h:144
FLOAT32 Range
Definition: ocrfeatures.h:50
#define MAXBUCKETS
Definition: cluster.h:27
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:184
inT16 SampleSize
Definition: cluster.h:87
CLUSTER * Root
Definition: cluster.h:91
FLOAT32 Min
Definition: ocrfeatures.h:48
FLOAT32 Max
Definition: ocrfeatures.h:49
#define MINBUCKETS
Definition: cluster.h:26
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
LIST ProtoList
Definition: cluster.h:92
inT32 NumberOfSamples
Definition: cluster.h:89
inT8 NonEssential
Definition: ocrfeatures.h:47
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
PROTOTYPE * MakeDegenerateProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics,
PROTOSTYLE  Style,
inT32  MinSamples 
)

MakeDegenerateProto ****************************************************** Parameters: N number of dimensions Cluster cluster being analyzed Statistics statistical info about cluster Style type of prototype to be generated MinSamples minimum number of samples in a cluster Operation: This routine checks for clusters which are degenerate and therefore cannot be analyzed in a statistically valid way. A cluster is defined as degenerate if it does not have at least MINSAMPLESNEEDED samples in it. If the cluster is found to be degenerate, a prototype of the specified style is generated and marked as insignificant. A cluster is also degenerate if it does not have at least MinSamples samples in it. If the cluster is not degenerate, NULL is returned. Return: Pointer to degenerate prototype or NULL. Exceptions: None History: 6/20/89, DSJ, Created. 7/12/89, DSJ, Changed name and added check for 0 stddev. 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).

Definition at line 1067 of file cluster.cpp.

1072  {
1073  PROTOTYPE *Proto = NULL;
1074 
1075  if (MinSamples < MINSAMPLESNEEDED)
1076  MinSamples = MINSAMPLESNEEDED;
1077 
1078  if (Cluster->SampleCount < MinSamples) {
1079  switch (Style) {
1080  case spherical:
1081  Proto = NewSphericalProto (N, Cluster, Statistics);
1082  break;
1083  case elliptical:
1084  case automatic:
1085  Proto = NewEllipticalProto (N, Cluster, Statistics);
1086  break;
1087  case mixed:
1088  Proto = NewMixedProto (N, Cluster, Statistics);
1089  break;
1090  }
1091  Proto->Significant = FALSE;
1092  }
1093  return (Proto);
1094 } // MakeDegenerateProto
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1510
unsigned SampleCount
Definition: cluster.h:35
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1589
#define NULL
Definition: host.h:144
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1544
#define FALSE
Definition: capi.h:28
Definition: cluster.h:45
#define MINSAMPLESNEEDED
Definition: cluster.cpp:151
unsigned Significant
Definition: cluster.h:68
void MakeDimRandom ( uinT16  i,
PROTOTYPE Proto,
PARAM_DESC ParamDesc 
)

Definition at line 1360 of file cluster.cpp.

1360  {
1361  Proto->Distrib[i] = D_random;
1362  Proto->Mean[i] = ParamDesc->MidRange;
1363  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1364 
1365  // subtract out the previous magnitude of this dimension from the total
1366  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1367  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1368  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1369  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1370 
1371  // note that the proto Weight is irrelevant for D_random protos
1372 } // MakeDimRandom
FLOAT32 TotalMagnitude
Definition: cluster.h:79
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 MidRange
Definition: ocrfeatures.h:52
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOAT32 HalfRange
Definition: ocrfeatures.h:51
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 Range
Definition: ocrfeatures.h:50
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Mean
Definition: cluster.h:78
void MakeDimUniform ( uinT16  i,
PROTOTYPE Proto,
STATISTICS Statistics 
)

MakeDimUniform *********************************************************** Parameters: i index of dimension to be changed Proto prototype whose dimension is to be altered Statistics statistical info about prototype Operation: This routine alters the ith dimension of the specified mixed prototype to be uniform. Return: None Exceptions: None History: 6/20/89, DSJ, Created.

Definition at line 1385 of file cluster.cpp.

1385  {
1386  Proto->Distrib[i] = uniform;
1387  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1388  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1389  Proto->Variance.Elliptical[i] =
1390  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1391  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1392  Proto->Variance.Elliptical[i] = MINVARIANCE;
1393 
1394  // subtract out the previous magnitude of this dimension from the total
1395  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1396  Proto->Magnitude.Elliptical[i] =
1397  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1398  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1399  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1400 
1401  // note that the proto Weight is irrelevant for uniform protos
1402 } // MakeDimUniform
FLOAT32 TotalMagnitude
Definition: cluster.h:79
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Min
Definition: cluster.cpp:170
#define MINVARIANCE
Definition: cluster.cpp:141
FLOAT32 * Max
Definition: cluster.cpp:171
FLOAT32 Mean[1]
Definition: cluster.h:39
FLOAT32 LogMagnitude
Definition: cluster.h:80
CLUSTER * Cluster
Definition: cluster.h:76
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Mean
Definition: cluster.h:78
PROTOTYPE * MakeEllipticalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

MakeEllipticalProto **************************************************** Parameters: Clusterer data struct containing samples being clustered Cluster cluster to be made into an elliptical prototype Statistics statistical info about cluster Buckets histogram struct used to analyze distribution Operation: This routine tests the specified cluster to see if it can be approximated by an elliptical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller. Return: Pointer to new elliptical prototype or NULL. Exceptions: None History: 6/12/89, DSJ, Created.

Definition at line 1254 of file cluster.cpp.

1257  {
1258  PROTOTYPE *Proto = NULL;
1259  int i;
1260 
1261  // check that each dimension is a normal distribution
1262  for (i = 0; i < Clusterer->SampleSize; i++) {
1263  if (Clusterer->ParamDesc[i].NonEssential)
1264  continue;
1265 
1266  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1267  Cluster->Mean[i],
1268  sqrt ((FLOAT64) Statistics->
1269  CoVariance[i * (Clusterer->SampleSize + 1)]));
1270  if (!DistributionOK (Buckets))
1271  break;
1272  }
1273  // if all dimensions matched a normal distribution, make a proto
1274  if (i >= Clusterer->SampleSize)
1275  Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1276  return (Proto);
1277 } // MakeEllipticalProto
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define NULL
Definition: host.h:144
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1544
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2015
double FLOAT64
Definition: host.h:112
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2187
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 Mean[1]
Definition: cluster.h:39
inT8 NonEssential
Definition: ocrfeatures.h:47
PROTOTYPE * MakeMixedProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS NormalBuckets,
FLOAT64  Confidence 
)

MakeMixedProto *********************************************************** Parameters: Clusterer data struct containing samples being clustered Cluster cluster to be made into a prototype Statistics statistical info about cluster NormalBuckets histogram struct used to analyze distribution Confidence confidence level for alternate distributions Operation: This routine tests each dimension of the specified cluster to see what distribution would best approximate that dimension. Each dimension is compared to the following distributions in order: normal, random, uniform. If each dimension can be represented by one of these distributions, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller. Return: Pointer to new mixed prototype or NULL. Exceptions: None History: 6/12/89, DSJ, Created.

Definition at line 1298 of file cluster.cpp.

1302  {
1303  PROTOTYPE *Proto;
1304  int i;
1305  BUCKETS *UniformBuckets = NULL;
1306  BUCKETS *RandomBuckets = NULL;
1307 
1308  // create a mixed proto to work on - initially assume all dimensions normal*/
1309  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1310 
1311  // find the proper distribution for each dimension
1312  for (i = 0; i < Clusterer->SampleSize; i++) {
1313  if (Clusterer->ParamDesc[i].NonEssential)
1314  continue;
1315 
1316  FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1317  Proto->Mean[i],
1318  sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1319  if (DistributionOK (NormalBuckets))
1320  continue;
1321 
1322  if (RandomBuckets == NULL)
1323  RandomBuckets =
1324  GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
1325  MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1326  FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1327  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1328  if (DistributionOK (RandomBuckets))
1329  continue;
1330 
1331  if (UniformBuckets == NULL)
1332  UniformBuckets =
1333  GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
1334  MakeDimUniform(i, Proto, Statistics);
1335  FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1336  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1337  if (DistributionOK (UniformBuckets))
1338  continue;
1339  break;
1340  }
1341  // if any dimension failed to match a distribution, discard the proto
1342  if (i < Clusterer->SampleSize) {
1343  FreePrototype(Proto);
1344  Proto = NULL;
1345  }
1346  return (Proto);
1347 } // MakeMixedProto
void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
Definition: cluster.cpp:1360
unsigned SampleCount
Definition: cluster.h:35
PARAM_DESC * ParamDesc
Definition: cluster.h:88
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1589
FLOAT32 * Elliptical
Definition: cluster.h:64
#define NULL
Definition: host.h:144
FLOATUNION Variance
Definition: cluster.h:81
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1705
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2015
double FLOAT64
Definition: host.h:112
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2187
inT16 SampleSize
Definition: cluster.h:87
void FreePrototype(void *arg)
Definition: cluster.cpp:575
inT8 NonEssential
Definition: ocrfeatures.h:47
void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
Definition: cluster.cpp:1385
FLOAT32 * Mean
Definition: cluster.h:78
CLUSTER * MakeNewCluster ( CLUSTERER Clusterer,
TEMPCLUSTER TempCluster 
)

MakeNewCluster ************************************************************* Parameters: Clusterer current clustering environment TempCluster potential cluster to make permanent Operation: This routine creates a new permanent cluster from the clusters specified in TempCluster. The 2 clusters in TempCluster are marked as "clustered" and deleted from the kd-tree. The new cluster is then added to the kd-tree. Return: Pointer to the new permanent cluster Exceptions: none History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 837 of file cluster.cpp.

837  {
838  CLUSTER *Cluster;
839 
840  // allocate the new cluster and initialize it
841  Cluster = (CLUSTER *) Emalloc(
842  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
843  Cluster->Clustered = FALSE;
844  Cluster->Prototype = FALSE;
845  Cluster->Left = TempCluster->Cluster;
846  Cluster->Right = TempCluster->Neighbor;
847  Cluster->CharID = -1;
848 
849  // mark the old clusters as "clustered" and delete them from the kd-tree
850  Cluster->Left->Clustered = TRUE;
851  Cluster->Right->Clustered = TRUE;
852  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
853  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
854 
855  // compute the mean and sample count for the new cluster
856  Cluster->SampleCount =
857  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
858  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
859  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
860 
861  // add the new cluster to the KD tree
862  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
863  return Cluster;
864 } // MakeNewCluster
unsigned Prototype
Definition: cluster.h:34
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:209
unsigned SampleCount
Definition: cluster.h:35
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void KDDelete(KDTREE *Tree, FLOAT32 Key[], void *Data)
Definition: kdtree.cpp:269
Definition: cluster.h:32
#define FALSE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 Mean[1]
Definition: cluster.h:39
struct sample * Right
Definition: cluster.h:37
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
CLUSTER * Cluster
Definition: cluster.cpp:163
unsigned Clustered
Definition: cluster.h:33
struct sample * Left
Definition: cluster.h:36
KDTREE * KDTree
Definition: cluster.h:90
inT32 CharID
Definition: cluster.h:38
CLUSTER * Neighbor
Definition: cluster.cpp:164
#define TRUE
Definition: capi.h:27
void MakePotentialClusters ( ClusteringContext context,
CLUSTER Cluster,
inT32  Level 
)

MakePotentialClusters ************************************************** Parameters: context ClusteringContext (see definition above) Cluster current cluster being visited in kd-tree walk Level level of this cluster in the kd-tree Operation: This routine is designed to be used in concert with the KDWalk routine. It will create a potential cluster for each sample in the kd-tree that is being walked. This potential cluster will then be pushed on the heap.

Definition at line 764 of file cluster.cpp.

765  {
766  HEAPENTRY HeapEntry;
767  int next = context->next;
768  context->candidates[next].Cluster = Cluster;
769  HeapEntry.Data = (char *) &(context->candidates[next]);
770  context->candidates[next].Neighbor =
771  FindNearestNeighbor(context->tree,
772  context->candidates[next].Cluster,
773  &HeapEntry.Key);
774  if (context->candidates[next].Neighbor != NULL) {
775  HeapStore(context->heap, &HeapEntry);
776  context->next++;
777  }
778 } // MakePotentialClusters
#define NULL
Definition: host.h:144
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:798
void HeapStore(HEAP *Heap, HEAPENTRY *Entry)
Definition: oldheap.cpp:234
void * Data
Definition: oldheap.h:34
TEMPCLUSTER * candidates
Definition: cluster.cpp:194
CLUSTER * Cluster
Definition: cluster.cpp:163
FLOAT32 Key
Definition: oldheap.h:33
CLUSTER * Neighbor
Definition: cluster.cpp:164
PROTOTYPE * MakePrototype ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster 
)

MakePrototype *********************************************************** Parameters: Clusterer data structure holding cluster tree Config parameters used to control prototype generation Cluster cluster to be made into a prototype Operation: This routine attempts to create a prototype from the specified cluster that conforms to the distribution specified in Config. If there are too few samples in the cluster to perform a statistical analysis, then a prototype is generated but labelled as insignificant. If the dimensions of the cluster are not independent, no prototype is generated and NULL is returned. If a prototype can be found that matches the desired distribution then a pointer to it is returned, otherwise NULL is returned. Return: Pointer to new prototype or NULL Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 974 of file cluster.cpp.

976  {
977  STATISTICS *Statistics;
978  PROTOTYPE *Proto;
979  BUCKETS *Buckets;
980 
981  // filter out clusters which contain samples from the same character
982  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
983  return NULL;
984 
985  // compute the covariance matrix and ranges for the cluster
986  Statistics =
987  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
988 
989  // check for degenerate clusters which need not be analyzed further
990  // note that the MinSamples test assumes that all clusters with multiple
991  // character samples have been removed (as above)
992  Proto = MakeDegenerateProto(
993  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
994  (inT32) (Config->MinSamples * Clusterer->NumChar));
995  if (Proto != NULL) {
996  FreeStatistics(Statistics);
997  return Proto;
998  }
999  // check to ensure that all dimensions are independent
1000  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
1001  Statistics->CoVariance, Config->Independence)) {
1002  FreeStatistics(Statistics);
1003  return NULL;
1004  }
1005 
1006  if (HOTELLING && Config->ProtoStyle == elliptical) {
1007  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1008  if (Proto != NULL) {
1009  FreeStatistics(Statistics);
1010  return Proto;
1011  }
1012  }
1013 
1014  // create a histogram data structure used to evaluate distributions
1015  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
1016  Config->Confidence);
1017 
1018  // create a prototype based on the statistics and test it
1019  switch (Config->ProtoStyle) {
1020  case spherical:
1021  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1022  break;
1023  case elliptical:
1024  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1025  break;
1026  case mixed:
1027  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1028  Config->Confidence);
1029  break;
1030  case automatic:
1031  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1032  if (Proto != NULL)
1033  break;
1034  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1035  if (Proto != NULL)
1036  break;
1037  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1038  Config->Confidence);
1039  break;
1040  }
1041  FreeStatistics(Statistics);
1042  return Proto;
1043 } // MakePrototype
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1254
FLOAT64 Confidence
Definition: cluster.h:54
unsigned SampleCount
Definition: cluster.h:35
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 Independence
Definition: cluster.h:53
STATISTICS * ComputeStatistics(inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
Definition: cluster.cpp:1422
inT32 NumChar
Definition: cluster.h:93
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1705
Definition: cluster.h:45
inT16 SampleSize
Definition: cluster.h:87
#define HOTELLING
Definition: cluster.cpp:29
void FreeStatistics(STATISTICS *Statistics)
Definition: cluster.cpp:2226
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1109
BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
Definition: cluster.cpp:2565
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
Definition: cluster.cpp:1298
BOOL8 Independent(PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
Definition: cluster.cpp:1656
FLOAT32 MaxIllegal
Definition: cluster.h:51
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1216
PROTOTYPE * MakeDegenerateProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
Definition: cluster.cpp:1067
FLOAT32 * CoVariance
Definition: cluster.cpp:169
FLOAT32 MinSamples
Definition: cluster.h:50
Definition: cluster.h:59
SAMPLE* MakeSample ( CLUSTERER Clusterer,
const FLOAT32 Feature,
inT32  CharID 
)

MakeSample *********************************************************** Parameters: Clusterer clusterer data structure to add sample to Feature feature to be added to clusterer CharID unique ident. of char that sample came from Operation: This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller. Return: Pointer to the new sample data structure Exceptions: ALREADYCLUSTERED MakeSample can't be called after ClusterSamples has been called History: 5/29/89, DSJ, Created.

Definition at line 450 of file cluster.cpp.

451  {
452  SAMPLE *Sample;
453  int i;
454 
455  // see if the samples have already been clustered - if so trap an error
456  if (Clusterer->Root != NULL)
458  "Can't add samples after they have been clustered");
459 
460  // allocate the new sample and initialize it
461  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
462  (Clusterer->SampleSize -
463  1) * sizeof (FLOAT32));
464  Sample->Clustered = FALSE;
465  Sample->Prototype = FALSE;
466  Sample->SampleCount = 1;
467  Sample->Left = NULL;
468  Sample->Right = NULL;
469  Sample->CharID = CharID;
470 
471  for (i = 0; i < Clusterer->SampleSize; i++)
472  Sample->Mean[i] = Feature[i];
473 
474  // add the sample to the KD tree - keep track of the total # of samples
475  Clusterer->NumberOfSamples++;
476  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
477  if (CharID >= Clusterer->NumChar)
478  Clusterer->NumChar = CharID + 1;
479 
480  // execute hook for monitoring clustering operation
481  // (*SampleCreationHook)( Sample );
482 
483  return (Sample);
484 } // MakeSample
unsigned Prototype
Definition: cluster.h:34
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:209
unsigned SampleCount
Definition: cluster.h:35
Definition: cluster.h:32
inT32 NumChar
Definition: cluster.h:93
#define NULL
Definition: host.h:144
void DoError(int Error, const char *Message)
Definition: danerror.cpp:32
#define FALSE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
inT16 SampleSize
Definition: cluster.h:87
CLUSTER * Root
Definition: cluster.h:91
FLOAT32 Mean[1]
Definition: cluster.h:39
struct sample * Right
Definition: cluster.h:37
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
inT32 NumberOfSamples
Definition: cluster.h:89
unsigned Clustered
Definition: cluster.h:33
struct sample * Left
Definition: cluster.h:36
#define ALREADYCLUSTERED
Definition: cluster.h:133
KDTREE * KDTree
Definition: cluster.h:90
inT32 CharID
Definition: cluster.h:38
PROTOTYPE * MakeSphericalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

Definition at line 1216 of file cluster.cpp.

1219  {
1220  PROTOTYPE *Proto = NULL;
1221  int i;
1222 
1223  // check that each dimension is a normal distribution
1224  for (i = 0; i < Clusterer->SampleSize; i++) {
1225  if (Clusterer->ParamDesc[i].NonEssential)
1226  continue;
1227 
1228  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1229  Cluster->Mean[i],
1230  sqrt ((FLOAT64) (Statistics->AvgVariance)));
1231  if (!DistributionOK (Buckets))
1232  break;
1233  }
1234  // if all dimensions matched a normal distribution, make a proto
1235  if (i >= Clusterer->SampleSize)
1236  Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1237  return (Proto);
1238 } // MakeSphericalProto
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1510
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define NULL
Definition: host.h:144
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2015
double FLOAT64
Definition: host.h:112
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2187
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 AvgVariance
Definition: cluster.cpp:168
FLOAT32 Mean[1]
Definition: cluster.h:39
inT8 NonEssential
Definition: ocrfeatures.h:47
FLOAT32 Mean ( PROTOTYPE Proto,
uinT16  Dimension 
)

Mean *********************************************************** Parameters: Proto prototype to return mean of Dimension dimension whose mean is to be returned Operation: This routine returns the mean of the specified prototype in the indicated dimension. Return: Mean of Prototype in Dimension Exceptions: none History: 7/6/89, DSJ, Created.

Definition at line 639 of file cluster.cpp.

639  {
640  return (Proto->Mean[Dimension]);
641 } // Mean
FLOAT32 * Mean
Definition: cluster.h:78
inT32 MergeClusters ( inT16  N,
register PARAM_DESC  ParamDesc[],
register inT32  n1,
register inT32  n2,
register FLOAT32  m[],
register FLOAT32  m1[],
register FLOAT32  m2[] 
)
inT32 MergeClusters ( inT16  N,
PARAM_DESC  ParamDesc[],
inT32  n1,
inT32  n2,
FLOAT32  m[],
FLOAT32  m1[],
FLOAT32  m2[] 
)

MergeClusters ************************************************************ Parameters: N # of dimensions (size of arrays) ParamDesc array of dimension descriptions n1, n2 number of samples in each old cluster m array to hold mean of new cluster m1, m2 arrays containing means of old clusters Operation: This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly. Return: The number of samples in the new cluster. Exceptions: None History: 5/31/89, DSJ, Created.

Definition at line 882 of file cluster.cpp.

887  {
888  inT32 i, n;
889 
890  n = n1 + n2;
891  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
892  if (ParamDesc->Circular) {
893  // if distance between means is greater than allowed
894  // reduce upper point by one "rotation" to compute mean
895  // then normalize the mean back into the accepted range
896  if ((*m2 - *m1) > ParamDesc->HalfRange) {
897  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
898  if (*m < ParamDesc->Min)
899  *m += ParamDesc->Range;
900  }
901  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
902  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
903  if (*m < ParamDesc->Min)
904  *m += ParamDesc->Range;
905  }
906  else
907  *m = (n1 * *m1 + n2 * *m2) / n;
908  }
909  else
910  *m = (n1 * *m1 + n2 * *m2) / n;
911  }
912  return n;
913 } // MergeClusters
int inT32
Definition: host.h:102
BOOL8 MultipleCharSamples ( CLUSTERER Clusterer,
CLUSTER Cluster,
FLOAT32  MaxIllegal 
)

Definition at line 2565 of file cluster.cpp.

2596 {
2597  static BOOL8 *CharFlags = NULL;
2598  static inT32 NumFlags = 0;
2599  int i;
2600  LIST SearchState;
2601  SAMPLE *Sample;
2602  inT32 CharID;
2603  inT32 NumCharInCluster;
2604  inT32 NumIllegalInCluster;
2605  FLOAT32 PercentIllegal;
2606 
2607  // initial estimate assumes that no illegal chars exist in the cluster
2608  NumCharInCluster = Cluster->SampleCount;
2609  NumIllegalInCluster = 0;
2610 
2611  if (Clusterer->NumChar > NumFlags) {
2612  if (CharFlags != NULL)
2613  memfree(CharFlags);
2614  NumFlags = Clusterer->NumChar;
2615  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2616  }
2617 
2618  for (i = 0; i < NumFlags; i++)
2619  CharFlags[i] = FALSE;
2620 
2621  // find each sample in the cluster and check if we have seen it before
2622  InitSampleSearch(SearchState, Cluster);
2623  while ((Sample = NextSample (&SearchState)) != NULL) {
2624  CharID = Sample->CharID;
2625  if (CharFlags[CharID] == FALSE) {
2626  CharFlags[CharID] = TRUE;
2627  }
2628  else {
2629  if (CharFlags[CharID] == TRUE) {
2630  NumIllegalInCluster++;
2631  CharFlags[CharID] = ILLEGAL_CHAR;
2632  }
2633  NumCharInCluster--;
2634  PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2635  if (PercentIllegal > MaxIllegal) {
2636  destroy(SearchState);
2637  return (TRUE);
2638  }
2639  }
2640  }
2641  return (FALSE);
2642 
2643 } // MultipleCharSamples
void memfree(void *element)
Definition: freelist.cpp:30
unsigned SampleCount
Definition: cluster.h:35
#define ILLEGAL_CHAR
Definition: cluster.h:32
inT32 NumChar
Definition: cluster.h:93
unsigned char BOOL8
Definition: host.h:113
LIST destroy(LIST list)
Definition: oldlist.cpp:187
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
#define FALSE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
#define InitSampleSearch(S, C)
Definition: cluster.h:105
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:614
inT32 CharID
Definition: cluster.h:38
#define TRUE
Definition: capi.h:27
CHISTRUCT * NewChiStruct ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

Definition at line 2432 of file cluster.cpp.

2432  {
2433 /*
2434  ** Parameters:
2435  ** DegreesOfFreedom degrees of freedom for new chi value
2436  ** Alpha confidence level for new chi value
2437  ** Operation:
2438  ** This routine allocates a new data structure which is used
2439  ** to hold a chi-squared value along with its associated
2440  ** number of degrees of freedom and alpha value.
2441  ** Return: none
2442  ** Exceptions: none
2443  ** History: Fri Aug 4 11:04:59 1989, DSJ, Created.
2444  */
2446 
2447  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2448  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2449  NewChiStruct->Alpha = Alpha;
2450  return (NewChiStruct);
2451 
2452 } // NewChiStruct
FLOAT64 Alpha
Definition: cluster.cpp:187
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2432
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2286
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
uinT16 DegreesOfFreedom
Definition: cluster.cpp:186
PROTOTYPE * NewEllipticalProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

NewEllipticalProto ******************************************************* Parameters: N number of dimensions Cluster cluster to be made into an elliptical prototype Statistics statistical info about samples in cluster Operation: This routine creates an elliptical prototype data structure to approximate the samples in the specified cluster. Elliptical prototypes have a variance for each dimension. All dimensions are normally distributed and independent. Return: Pointer to a new elliptical prototype data structure Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 1544 of file cluster.cpp.

1546  {
1547  PROTOTYPE *Proto;
1548  FLOAT32 *CoVariance;
1549  int i;
1550 
1551  Proto = NewSimpleProto (N, Cluster);
1552  Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1553  Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1554  Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1555 
1556  CoVariance = Statistics->CoVariance;
1557  Proto->TotalMagnitude = 1.0;
1558  for (i = 0; i < N; i++, CoVariance += N + 1) {
1559  Proto->Variance.Elliptical[i] = *CoVariance;
1560  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1561  Proto->Variance.Elliptical[i] = MINVARIANCE;
1562 
1563  Proto->Magnitude.Elliptical[i] =
1564  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1565  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1566  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1567  }
1568  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1569  Proto->Style = elliptical;
1570  return (Proto);
1571 } // NewEllipticalProto
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1614
FLOAT32 TotalMagnitude
Definition: cluster.h:79
unsigned Style
Definition: cluster.h:74
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOATUNION Variance
Definition: cluster.h:81
#define PI
Definition: const.h:19
float FLOAT32
Definition: host.h:111
#define MINVARIANCE
Definition: cluster.cpp:141
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Weight
Definition: cluster.h:83
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
FLOAT32 * CoVariance
Definition: cluster.cpp:169
FLOATUNION Magnitude
Definition: cluster.h:82
PROTOTYPE * NewMixedProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

MewMixedProto ************************************************************ Parameters: N number of dimensions Cluster cluster to be made into a mixed prototype Statistics statistical info about samples in cluster Operation: This routine creates a mixed prototype data structure to approximate the samples in the specified cluster. Mixed prototypes can have different distributions for each dimension. All dimensions are independent. The structure is initially filled in as though it were an elliptical prototype. The actual distributions of the dimensions can be altered by other routines. Return: Pointer to a new mixed prototype data structure Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 1589 of file cluster.cpp.

1589  {
1590  PROTOTYPE *Proto;
1591  int i;
1592 
1593  Proto = NewEllipticalProto (N, Cluster, Statistics);
1594  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1595 
1596  for (i = 0; i < N; i++) {
1597  Proto->Distrib[i] = normal;
1598  }
1599  Proto->Style = mixed;
1600  return (Proto);
1601 } // NewMixedProto
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1544
DISTRIBUTION
Definition: cluster.h:58
Definition: cluster.h:45
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
Definition: cluster.h:59
PROTOTYPE * NewSimpleProto ( inT16  N,
CLUSTER Cluster 
)

NewSimpleProto *********************************************************** Parameters: N number of dimensions Cluster cluster to be made into a prototype Operation: This routine allocates memory to hold a simple prototype data structure, i.e. one without independent distributions and variances for each dimension. Return: Pointer to new simple prototype Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 1614 of file cluster.cpp.

1614  {
1615  PROTOTYPE *Proto;
1616  int i;
1617 
1618  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1619  Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1620 
1621  for (i = 0; i < N; i++)
1622  Proto->Mean[i] = Cluster->Mean[i];
1623  Proto->Distrib = NULL;
1624 
1625  Proto->Significant = TRUE;
1626  Proto->Merged = FALSE;
1627  Proto->Style = spherical;
1628  Proto->NumSamples = Cluster->SampleCount;
1629  Proto->Cluster = Cluster;
1630  Proto->Cluster->Prototype = TRUE;
1631  return (Proto);
1632 } // NewSimpleProto
unsigned Prototype
Definition: cluster.h:34
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Merged
Definition: cluster.h:69
unsigned NumSamples
Definition: cluster.h:75
unsigned SampleCount
Definition: cluster.h:35
unsigned Style
Definition: cluster.h:74
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
unsigned Significant
Definition: cluster.h:68
CLUSTER * Cluster
Definition: cluster.h:76
FLOAT32 * Mean
Definition: cluster.h:78
#define TRUE
Definition: capi.h:27
PROTOTYPE * NewSphericalProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

NewSpericalProto ********************************************************* Parameters: N number of dimensions Cluster cluster to be made into a spherical prototype Statistics statistical info about samples in cluster Operation: This routine creates a spherical prototype data structure to approximate the samples in the specified cluster. Spherical prototypes have a single variance which is common across all dimensions. All dimensions are normally distributed and independent. Return: Pointer to a new spherical prototype data structure Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 1510 of file cluster.cpp.

1512  {
1513  PROTOTYPE *Proto;
1514 
1515  Proto = NewSimpleProto (N, Cluster);
1516 
1517  Proto->Variance.Spherical = Statistics->AvgVariance;
1518  if (Proto->Variance.Spherical < MINVARIANCE)
1519  Proto->Variance.Spherical = MINVARIANCE;
1520 
1521  Proto->Magnitude.Spherical =
1522  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1523  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1524  (double) N);
1525  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1526  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1527 
1528  return (Proto);
1529 } // NewSphericalProto
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1614
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 Spherical
Definition: cluster.h:63
FLOATUNION Variance
Definition: cluster.h:81
#define PI
Definition: const.h:19
#define MINVARIANCE
Definition: cluster.cpp:141
FLOAT32 AvgVariance
Definition: cluster.cpp:168
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:82
CLUSTER* NextSample ( LIST SearchState)

NextSample ************************************************************ Parameters: SearchState ptr to list containing clusters to be searched Operation: This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, NULL is returned. InitSampleSearch() must be called before NextSample() to initialize the search. Return: Pointer to the next leaf cluster (sample) or NULL. Exceptions: None History: 6/16/89, DSJ, Created.

Definition at line 614 of file cluster.cpp.

614  {
615  CLUSTER *Cluster;
616 
617  if (*SearchState == NIL_LIST)
618  return (NULL);
619  Cluster = (CLUSTER *) first_node (*SearchState);
620  *SearchState = pop (*SearchState);
621  while (TRUE) {
622  if (Cluster->Left == NULL)
623  return (Cluster);
624  *SearchState = push (*SearchState, Cluster->Right);
625  Cluster = Cluster->Left;
626  }
627 } // NextSample
LIST pop(LIST list)
Definition: oldlist.cpp:305
Definition: cluster.h:32
#define NIL_LIST
Definition: oldlist.h:126
#define NULL
Definition: host.h:144
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
#define first_node(l)
Definition: oldlist.h:139
#define TRUE
Definition: capi.h:27
uinT16 NormalBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

Definition at line 2103 of file cluster.cpp.

2106  {
2107 /*
2108  ** Parameters:
2109  ** ParamDesc used to identify circular dimensions
2110  ** x value to be normalized
2111  ** Mean mean of normal distribution
2112  ** StdDev standard deviation of normal distribution
2113  ** Operation:
2114  ** This routine determines which bucket x falls into in the
2115  ** discrete normal distribution defined by kNormalMean
2116  ** and kNormalStdDev. x values which exceed the range of
2117  ** the discrete distribution are clipped.
2118  ** Return:
2119  ** Bucket number into which x falls
2120  ** Exceptions:
2121  ** None
2122  ** History:
2123  ** 6/5/89, DSJ, Created.
2124  */
2125  FLOAT32 X;
2126 
2127  // wraparound circular parameters if necessary
2128  if (ParamDesc->Circular) {
2129  if (x - Mean > ParamDesc->HalfRange)
2130  x -= ParamDesc->Range;
2131  else if (x - Mean < -ParamDesc->HalfRange)
2132  x += ParamDesc->Range;
2133  }
2134 
2135  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2136  if (X < 0)
2137  return 0;
2138  if (X > BUCKETTABLESIZE - 1)
2139  return ((uinT16) (BUCKETTABLESIZE - 1));
2140  return (uinT16) floor((FLOAT64) X);
2141 } // NormalBucket
inT8 Circular
Definition: ocrfeatures.h:46
FLOAT32 HalfRange
Definition: ocrfeatures.h:51
FLOAT32 Range
Definition: ocrfeatures.h:50
float FLOAT32
Definition: host.h:111
double FLOAT64
Definition: host.h:112
unsigned short uinT16
Definition: host.h:101
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:639
#define BUCKETTABLESIZE
Definition: cluster.cpp:159
FLOAT64 NormalDensity ( inT32  x)

Definition at line 1940 of file cluster.cpp.

1940  {
1941 /*
1942  ** Parameters:
1943  ** x number to compute the normal probability density for
1944  ** Globals:
1945  ** kNormalMean mean of a discrete normal distribution
1946  ** kNormalVariance variance of a discrete normal distribution
1947  ** kNormalMagnitude magnitude of a discrete normal distribution
1948  ** Operation:
1949  ** This routine computes the probability density function
1950  ** of a discrete normal distribution defined by the global
1951  ** variables kNormalMean, kNormalVariance, and kNormalMagnitude.
1952  ** Normal magnitude could, of course, be computed in terms of
1953  ** the normal variance but it is precomputed for efficiency.
1954  ** Return:
1955  ** The value of the normal distribution at x.
1956  ** Exceptions:
1957  ** None
1958  ** History:
1959  ** 6/4/89, DSJ, Created.
1960  */
1961  FLOAT64 Distance;
1962 
1963  Distance = x - kNormalMean;
1964  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1965 } // NormalDensity
double FLOAT64
Definition: host.h:112
int NumBucketsMatch ( void *  arg1,
void *  arg2 
)

Definition at line 2316 of file cluster.cpp.

2317  { // uinT16 *DesiredNumberOfBuckets)
2318 /*
2319  ** Parameters:
2320  ** Histogram current histogram being tested for a match
2321  ** DesiredNumberOfBuckets match key
2322  ** Operation:
2323  ** This routine is used to search a list of histogram data
2324  ** structures to find one with the specified number of
2325  ** buckets. It is called by the list search routines.
2326  ** Return: TRUE if Histogram matches DesiredNumberOfBuckets
2327  ** Exceptions: none
2328  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2329  */
2330  BUCKETS *Histogram = (BUCKETS *) arg1;
2331  uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2332 
2333  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2334 
2335 } // NumBucketsMatch
uinT16 NumberOfBuckets
Definition: cluster.cpp:179
unsigned short uinT16
Definition: host.h:101
uinT16 OptimumNumberOfBuckets ( uinT32  SampleCount)

Definition at line 1839 of file cluster.cpp.

1839  {
1840 /*
1841  ** Parameters:
1842  ** SampleCount number of samples to be tested
1843  ** Operation:
1844  ** This routine computes the optimum number of histogram
1845  ** buckets that should be used in a chi-squared goodness of
1846  ** fit test for the specified number of samples. The optimum
1847  ** number is computed based on Table 4.1 on pg. 147 of
1848  ** "Measurement and Analysis of Random Data" by Bendat & Piersol.
1849  ** Linear interpolation is used to interpolate between table
1850  ** values. The table is intended for a 0.05 level of
1851  ** significance (alpha). This routine assumes that it is
1852  ** equally valid for other alpha's, which may not be true.
1853  ** Return:
1854  ** Optimum number of histogram buckets
1855  ** Exceptions:
1856  ** None
1857  ** History:
1858  ** 6/5/89, DSJ, Created.
1859  */
1860  uinT8 Last, Next;
1861  FLOAT32 Slope;
1862 
1863  if (SampleCount < kCountTable[0])
1864  return kBucketsTable[0];
1865 
1866  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1867  if (SampleCount <= kCountTable[Next]) {
1868  Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
1869  (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
1870  return ((uinT16) (kBucketsTable[Last] +
1871  Slope * (SampleCount - kCountTable[Last])));
1872  }
1873  }
1874  return kBucketsTable[Last];
1875 } // OptimumNumberOfBuckets
#define LOOKUPTABLESIZE
Definition: cluster.cpp:224
float FLOAT32
Definition: host.h:111
unsigned short uinT16
Definition: host.h:101
unsigned char uinT8
Definition: host.h:99
FLOAT64 Solve ( SOLVEFUNC  Function,
void *  FunctionParams,
FLOAT64  InitialGuess,
FLOAT64  Accuracy 
)

Definition at line 2457 of file cluster.cpp.

2477 {
2478  FLOAT64 x;
2479  FLOAT64 f;
2480  FLOAT64 Slope;
2481  FLOAT64 Delta;
2482  FLOAT64 NewDelta;
2483  FLOAT64 xDelta;
2484  FLOAT64 LastPosX, LastNegX;
2485 
2486  x = InitialGuess;
2487  Delta = INITIALDELTA;
2488  LastPosX = MAX_FLOAT32;
2489  LastNegX = -MAX_FLOAT32;
2490  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2491  while (Abs (LastPosX - LastNegX) > Accuracy) {
2492  // keep track of outer bounds of current estimate
2493  if (f < 0)
2494  LastNegX = x;
2495  else
2496  LastPosX = x;
2497 
2498  // compute the approx. slope of f(x) at the current point
2499  Slope =
2500  ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2501 
2502  // compute the next solution guess */
2503  xDelta = f / Slope;
2504  x -= xDelta;
2505 
2506  // reduce the delta used for computing slope to be a fraction of
2507  //the amount moved to get to the new guess
2508  NewDelta = Abs (xDelta) * DELTARATIO;
2509  if (NewDelta < Delta)
2510  Delta = NewDelta;
2511 
2512  // compute the value of the function at the new guess
2513  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2514  }
2515  return (x);
2516 
2517 } // Solve
#define Abs(N)
Definition: cluster.cpp:204
#define f(xc, yc)
Definition: imgscale.cpp:39
#define DELTARATIO
double FLOAT64
Definition: host.h:112
#define INITIALDELTA
#define MAX_FLOAT32
Definition: host.h:124
FLOAT32 StandardDeviation ( PROTOTYPE Proto,
uinT16  Dimension 
)

StandardDeviation ************************************************* Parameters: Proto prototype to return standard deviation of Dimension dimension whose stddev is to be returned Operation: This routine returns the standard deviation of the prototype in the indicated dimension. Return: Standard deviation of Prototype in Dimension Exceptions: none History: 7/6/89, DSJ, Created.

Definition at line 653 of file cluster.cpp.

653  {
654  switch (Proto->Style) {
655  case spherical:
656  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
657  case elliptical:
658  return ((FLOAT32)
659  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
660  case mixed:
661  switch (Proto->Distrib[Dimension]) {
662  case normal:
663  return ((FLOAT32)
664  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
665  case uniform:
666  case D_random:
667  return (Proto->Variance.Elliptical[Dimension]);
668  case DISTRIBUTION_COUNT:
669  ASSERT_HOST(!"Distribution count not allowed!");
670  }
671  }
672  return 0.0f;
673 } // StandardDeviation
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 Spherical
Definition: cluster.h:63
unsigned Style
Definition: cluster.h:74
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOATUNION Variance
Definition: cluster.h:81
float FLOAT32
Definition: host.h:111
Definition: cluster.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: cluster.h:59
PROTOTYPE * TestEllipticalProto ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster,
STATISTICS Statistics 
)

TestEllipticalProto **************************************************** Parameters: Clusterer data struct containing samples being clustered Config provides the magic number of samples that make a good cluster Cluster cluster to be made into an elliptical prototype Statistics statistical info about cluster Operation: This routine tests the specified cluster to see if ** there is a statistically significant difference between the sub-clusters that would be made if the cluster were to be split. If not, then a new prototype is formed and returned to the caller. If there is, then NULL is returned to the caller. Return: Pointer to new elliptical prototype or NULL.

Definition at line 1109 of file cluster.cpp.

1112  {
1113  // Fraction of the number of samples used as a range around 1 within
1114  // which a cluster has the magic size that allows a boost to the
1115  // FTable by kFTableBoostMargin, thus allowing clusters near the
1116  // magic size (equal to the number of sample characters) to be more
1117  // likely to stay together.
1118  const double kMagicSampleMargin = 0.0625;
1119  const double kFTableBoostMargin = 2.0;
1120 
1121  int N = Clusterer->SampleSize;
1122  CLUSTER* Left = Cluster->Left;
1123  CLUSTER* Right = Cluster->Right;
1124  if (Left == NULL || Right == NULL)
1125  return NULL;
1126  int TotalDims = Left->SampleCount + Right->SampleCount;
1127  if (TotalDims < N + 1 || TotalDims < 2)
1128  return NULL;
1129  const int kMatrixSize = N * N * sizeof(FLOAT32);
1130  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1131  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1132  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1133  // Compute a new covariance matrix that only uses essential features.
1134  for (int i = 0; i < N; ++i) {
1135  int row_offset = i * N;
1136  if (!Clusterer->ParamDesc[i].NonEssential) {
1137  for (int j = 0; j < N; ++j) {
1138  if (!Clusterer->ParamDesc[j].NonEssential)
1139  Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1140  else
1141  Covariance[j + row_offset] = 0.0f;
1142  }
1143  } else {
1144  for (int j = 0; j < N; ++j) {
1145  if (i == j)
1146  Covariance[j + row_offset] = 1.0f;
1147  else
1148  Covariance[j + row_offset] = 0.0f;
1149  }
1150  }
1151  }
1152  double err = InvertMatrix(Covariance, N, Inverse);
1153  if (err > 1) {
1154  tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1155  }
1156  int EssentialN = 0;
1157  for (int dim = 0; dim < N; ++dim) {
1158  if (!Clusterer->ParamDesc[dim].NonEssential) {
1159  Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1160  ++EssentialN;
1161  } else {
1162  Delta[dim] = 0.0f;
1163  }
1164  }
1165  // Compute Hotelling's T-squared.
1166  double Tsq = 0.0;
1167  for (int x = 0; x < N; ++x) {
1168  double temp = 0.0;
1169  for (int y = 0; y < N; ++y) {
1170  temp += Inverse[y + N*x] * Delta[y];
1171  }
1172  Tsq += Delta[x] * temp;
1173  }
1174  memfree(Covariance);
1175  memfree(Inverse);
1176  memfree(Delta);
1177  // Changed this function to match the formula in
1178  // Statistical Methods in Medical Research p 473
1179  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1180  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1181  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1182  int Fx = EssentialN;
1183  if (Fx > FTABLE_X)
1184  Fx = FTABLE_X;
1185  --Fx;
1186  int Fy = TotalDims - EssentialN - 1;
1187  if (Fy > FTABLE_Y)
1188  Fy = FTABLE_Y;
1189  --Fy;
1190  double FTarget = FTable[Fy][Fx];
1191  if (Config->MagicSamples > 0 &&
1192  TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1193  TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1194  // Give magic-sized clusters a magic FTable boost.
1195  FTarget += kFTableBoostMargin;
1196  }
1197  if (F < FTarget) {
1198  return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1199  }
1200  return NULL;
1201 }
const double FTable[FTABLE_Y][FTABLE_X]
Definition: cluster.cpp:34
void memfree(void *element)
Definition: freelist.cpp:30
unsigned SampleCount
Definition: cluster.h:35
PARAM_DESC * ParamDesc
Definition: cluster.h:88
double InvertMatrix(const float *input, int size, float *inv)
Definition: cluster.cpp:2648
Definition: cluster.h:32
#define FTABLE_Y
Definition: cluster.cpp:31
#define NULL
Definition: host.h:144
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1544
#define FTABLE_X
Definition: cluster.cpp:30
float FLOAT32
Definition: host.h:111
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 Mean[1]
Definition: cluster.h:39
struct sample * Right
Definition: cluster.h:37
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
inT8 NonEssential
Definition: ocrfeatures.h:47
FLOAT32 * CoVariance
Definition: cluster.cpp:169
struct sample * Left
Definition: cluster.h:36
int MagicSamples
Definition: cluster.h:55
uinT16 UniformBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

Definition at line 2145 of file cluster.cpp.

2148  {
2149 /*
2150  ** Parameters:
2151  ** ParamDesc used to identify circular dimensions
2152  ** x value to be normalized
2153  ** Mean center of range of uniform distribution
2154  ** StdDev 1/2 the range of the uniform distribution
2155  ** Operation:
2156  ** This routine determines which bucket x falls into in the
2157  ** discrete uniform distribution defined by
2158  ** BUCKETTABLESIZE. x values which exceed the range of
2159  ** the discrete distribution are clipped.
2160  ** Return:
2161  ** Bucket number into which x falls
2162  ** Exceptions:
2163  ** None
2164  ** History:
2165  ** 6/5/89, DSJ, Created.
2166  */
2167  FLOAT32 X;
2168 
2169  // wraparound circular parameters if necessary
2170  if (ParamDesc->Circular) {
2171  if (x - Mean > ParamDesc->HalfRange)
2172  x -= ParamDesc->Range;
2173  else if (x - Mean < -ParamDesc->HalfRange)
2174  x += ParamDesc->Range;
2175  }
2176 
2177  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2178  if (X < 0)
2179  return 0;
2180  if (X > BUCKETTABLESIZE - 1)
2181  return (uinT16) (BUCKETTABLESIZE - 1);
2182  return (uinT16) floor((FLOAT64) X);
2183 } // UniformBucket
inT8 Circular
Definition: ocrfeatures.h:46
FLOAT32 HalfRange
Definition: ocrfeatures.h:51
FLOAT32 Range
Definition: ocrfeatures.h:50
float FLOAT32
Definition: host.h:111
double FLOAT64
Definition: host.h:112
unsigned short uinT16
Definition: host.h:101
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:639
#define BUCKETTABLESIZE
Definition: cluster.cpp:159
FLOAT64 UniformDensity ( inT32  x)

Definition at line 1969 of file cluster.cpp.

1969  {
1970 /*
1971  ** Parameters:
1972  ** x number to compute the uniform probability density for
1973  ** Operation:
1974  ** This routine computes the probability density function
1975  ** of a uniform distribution at the specified point. The
1976  ** range of the distribution is from 0 to BUCKETTABLESIZE.
1977  ** Return:
1978  ** The value of the uniform distribution at x.
1979  ** Exceptions:
1980  ** None
1981  ** History:
1982  ** 6/5/89, DSJ, Created.
1983  */
1984  static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
1985 
1986  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
1987  return UniformDistributionDensity;
1988  else
1989  return (FLOAT64) 0.0;
1990 } // UniformDensity
double FLOAT64
Definition: host.h:112
#define BUCKETTABLESIZE
Definition: cluster.cpp:159

Variable Documentation

const double FTable[FTABLE_Y][FTABLE_X]

Definition at line 34 of file cluster.cpp.