Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cluster.h File Reference
#include "kdtree.h"
#include "oldlist.h"

Go to the source code of this file.

Classes

struct  sample
 
struct  CLUSTERCONFIG
 
union  FLOATUNION
 
struct  PROTOTYPE
 
struct  CLUSTERER
 
struct  SAMPLELIST
 

Macros

#define MINBUCKETS   5
 
#define MAXBUCKETS   39
 
#define InitSampleSearch(S, C)   (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))
 
#define ALREADYCLUSTERED   4000
 

Typedefs

typedef struct sample CLUSTER
 
typedef CLUSTER SAMPLE
 

Enumerations

enum  PROTOSTYLE { spherical, elliptical, mixed, automatic }
 
enum  DISTRIBUTION { normal, uniform, D_random, DISTRIBUTION_COUNT }
 

Functions

CLUSTERERMakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
FLOAT32 Mean (PROTOTYPE *Proto, uinT16 Dimension)
 
FLOAT32 StandardDeviation (PROTOTYPE *Proto, uinT16 Dimension)
 
inT32 MergeClusters (inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
 

Macro Definition Documentation

#define ALREADYCLUSTERED   4000

Definition at line 133 of file cluster.h.

#define InitSampleSearch (   S,
 
)    (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))

Definition at line 105 of file cluster.h.

#define MAXBUCKETS   39

Definition at line 27 of file cluster.h.

#define MINBUCKETS   5

Definition at line 26 of file cluster.h.

Typedef Documentation

typedef struct sample CLUSTER
typedef CLUSTER SAMPLE

Definition at line 42 of file cluster.h.

Enumeration Type Documentation

Enumerator
normal 
uniform 
D_random 
DISTRIBUTION_COUNT 

Definition at line 58 of file cluster.h.

enum PROTOSTYLE
Enumerator
spherical 
elliptical 
mixed 
automatic 

Definition at line 44 of file cluster.h.

44  {
46 } PROTOSTYLE;
Definition: cluster.h:45
PROTOSTYLE
Definition: cluster.h:44

Function Documentation

LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

ClusterSamples *********************************************************** Parameters: Clusterer data struct containing samples to be clustered Config parameters which control clustering process Operation: This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info. If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree. In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config. Return: Pointer to a list of prototypes Exceptions: None History: 5/29/89, DSJ, Created.

Definition at line 504 of file cluster.cpp.

504  {
505  //only create cluster tree if samples have never been clustered before
506  if (Clusterer->Root == NULL)
507  CreateClusterTree(Clusterer);
508 
509  //deallocate the old prototype list if one exists
510  FreeProtoList (&Clusterer->ProtoList);
511  Clusterer->ProtoList = NIL_LIST;
512 
513  //compute prototypes starting at the root node in the tree
514  ComputePrototypes(Clusterer, Config);
515  return (Clusterer->ProtoList);
516 } // ClusterSamples
#define NIL_LIST
Definition: oldlist.h:126
#define NULL
Definition: host.h:144
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:694
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:927
CLUSTER * Root
Definition: cluster.h:91
LIST ProtoList
Definition: cluster.h:92
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:560
void FreeClusterer ( CLUSTERER Clusterer)

FreeClusterer ************************************************************* Parameters: Clusterer pointer to data structure to be freed Operation: This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to NULL to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid. Return: None Exceptions: None History: 6/6/89, DSJ, Created.

Definition at line 532 of file cluster.cpp.

532  {
533  if (Clusterer != NULL) {
534  memfree (Clusterer->ParamDesc);
535  if (Clusterer->KDTree != NULL)
536  FreeKDTree (Clusterer->KDTree);
537  if (Clusterer->Root != NULL)
538  FreeCluster (Clusterer->Root);
539  // Free up all used buckets structures.
540  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
541  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
542  if (Clusterer->bucket_cache[d][c] != NULL)
543  FreeBuckets(Clusterer->bucket_cache[d][c]);
544  }
545 
546  memfree(Clusterer);
547  }
548 } // FreeClusterer
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2248
void memfree(void *element)
Definition: freelist.cpp:30
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define NULL
Definition: host.h:144
#define MAXBUCKETS
Definition: cluster.h:27
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:346
CLUSTER * Root
Definition: cluster.h:91
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2262
#define MINBUCKETS
Definition: cluster.h:26
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
void FreeProtoList ( LIST ProtoList)

FreeProtoList ************************************************************ Parameters: ProtoList pointer to list of prototypes to be freed Operation: This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed. Return: None Exceptions: None History: 6/6/89, DSJ, Created.

Definition at line 560 of file cluster.cpp.

560  {
561  destroy_nodes(*ProtoList, FreePrototype);
562 } // FreeProtoList
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
void FreePrototype(void *arg)
Definition: cluster.cpp:575
void FreePrototype ( void *  arg)

FreePrototype ************************************************************ Parameters: Prototype prototype data structure to be deallocated Operation: This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine. Return: None Exceptions: None History: 5/30/89, DSJ, Created.

Definition at line 575 of file cluster.cpp.

575  { //PROTOTYPE *Prototype)
576  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
577 
578  // unmark the corresponding cluster (if there is one
579  if (Prototype->Cluster != NULL)
580  Prototype->Cluster->Prototype = FALSE;
581 
582  // deallocate the prototype statistics and then the prototype itself
583  if (Prototype->Distrib != NULL)
584  memfree (Prototype->Distrib);
585  if (Prototype->Mean != NULL)
586  memfree (Prototype->Mean);
587  if (Prototype->Style != spherical) {
588  if (Prototype->Variance.Elliptical != NULL)
589  memfree (Prototype->Variance.Elliptical);
590  if (Prototype->Magnitude.Elliptical != NULL)
591  memfree (Prototype->Magnitude.Elliptical);
592  if (Prototype->Weight.Elliptical != NULL)
593  memfree (Prototype->Weight.Elliptical);
594  }
595  memfree(Prototype);
596 } // FreePrototype
unsigned Prototype
Definition: cluster.h:34
DISTRIBUTION * Distrib
Definition: cluster.h:77
void memfree(void *element)
Definition: freelist.cpp:30
unsigned Style
Definition: cluster.h:74
FLOAT32 * Elliptical
Definition: cluster.h:64
#define NULL
Definition: host.h:144
FLOATUNION Variance
Definition: cluster.h:81
#define FALSE
Definition: capi.h:28
FLOATUNION Weight
Definition: cluster.h:83
CLUSTER * Cluster
Definition: cluster.h:76
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Mean
Definition: cluster.h:78
CLUSTERER* MakeClusterer ( inT16  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

MakeClusterer ********************************************************** Parameters: SampleSize number of dimensions in feature space ParamDesc description of each dimension Operation: This routine creates a new clusterer data structure, initializes it, and returns a pointer to it. Return: pointer to the new clusterer data structure Exceptions: None History: 5/29/89, DSJ, Created.

Definition at line 395 of file cluster.cpp.

395  {
396  CLUSTERER *Clusterer;
397  int i;
398 
399  // allocate main clusterer data structure and init simple fields
400  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
401  Clusterer->SampleSize = SampleSize;
402  Clusterer->NumberOfSamples = 0;
403  Clusterer->NumChar = 0;
404 
405  // init fields which will not be used initially
406  Clusterer->Root = NULL;
407  Clusterer->ProtoList = NIL_LIST;
408 
409  // maintain a copy of param descriptors in the clusterer data structure
410  Clusterer->ParamDesc =
411  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
412  for (i = 0; i < SampleSize; i++) {
413  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
414  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
415  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
416  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
417  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
418  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
419  Clusterer->ParamDesc[i].MidRange =
420  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
421  }
422 
423  // allocate a kd tree to hold the samples
424  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
425 
426  // Initialize cache of histogram buckets to minimize recomputing them.
427  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
428  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
429  Clusterer->bucket_cache[d][c] = NULL;
430  }
431 
432  return Clusterer;
433 } // MakeClusterer
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 MidRange
Definition: ocrfeatures.h:52
#define NIL_LIST
Definition: oldlist.h:126
inT32 NumChar
Definition: cluster.h:93
inT8 Circular
Definition: ocrfeatures.h:46
FLOAT32 HalfRange
Definition: ocrfeatures.h:51
#define NULL
Definition: host.h:144
FLOAT32 Range
Definition: ocrfeatures.h:50
#define MAXBUCKETS
Definition: cluster.h:27
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:184
inT16 SampleSize
Definition: cluster.h:87
CLUSTER * Root
Definition: cluster.h:91
FLOAT32 Min
Definition: ocrfeatures.h:48
FLOAT32 Max
Definition: ocrfeatures.h:49
#define MINBUCKETS
Definition: cluster.h:26
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
LIST ProtoList
Definition: cluster.h:92
inT32 NumberOfSamples
Definition: cluster.h:89
inT8 NonEssential
Definition: ocrfeatures.h:47
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
SAMPLE* MakeSample ( CLUSTERER Clusterer,
const FLOAT32 Feature,
inT32  CharID 
)

MakeSample *********************************************************** Parameters: Clusterer clusterer data structure to add sample to Feature feature to be added to clusterer CharID unique ident. of char that sample came from Operation: This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller. Return: Pointer to the new sample data structure Exceptions: ALREADYCLUSTERED MakeSample can't be called after ClusterSamples has been called History: 5/29/89, DSJ, Created.

Definition at line 450 of file cluster.cpp.

451  {
452  SAMPLE *Sample;
453  int i;
454 
455  // see if the samples have already been clustered - if so trap an error
456  if (Clusterer->Root != NULL)
458  "Can't add samples after they have been clustered");
459 
460  // allocate the new sample and initialize it
461  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
462  (Clusterer->SampleSize -
463  1) * sizeof (FLOAT32));
464  Sample->Clustered = FALSE;
465  Sample->Prototype = FALSE;
466  Sample->SampleCount = 1;
467  Sample->Left = NULL;
468  Sample->Right = NULL;
469  Sample->CharID = CharID;
470 
471  for (i = 0; i < Clusterer->SampleSize; i++)
472  Sample->Mean[i] = Feature[i];
473 
474  // add the sample to the KD tree - keep track of the total # of samples
475  Clusterer->NumberOfSamples++;
476  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
477  if (CharID >= Clusterer->NumChar)
478  Clusterer->NumChar = CharID + 1;
479 
480  // execute hook for monitoring clustering operation
481  // (*SampleCreationHook)( Sample );
482 
483  return (Sample);
484 } // MakeSample
unsigned Prototype
Definition: cluster.h:34
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:209
unsigned SampleCount
Definition: cluster.h:35
Definition: cluster.h:32
inT32 NumChar
Definition: cluster.h:93
#define NULL
Definition: host.h:144
void DoError(int Error, const char *Message)
Definition: danerror.cpp:32
#define FALSE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
inT16 SampleSize
Definition: cluster.h:87
CLUSTER * Root
Definition: cluster.h:91
FLOAT32 Mean[1]
Definition: cluster.h:39
struct sample * Right
Definition: cluster.h:37
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
inT32 NumberOfSamples
Definition: cluster.h:89
unsigned Clustered
Definition: cluster.h:33
struct sample * Left
Definition: cluster.h:36
#define ALREADYCLUSTERED
Definition: cluster.h:133
KDTREE * KDTree
Definition: cluster.h:90
inT32 CharID
Definition: cluster.h:38
FLOAT32 Mean ( PROTOTYPE Proto,
uinT16  Dimension 
)

Mean *********************************************************** Parameters: Proto prototype to return mean of Dimension dimension whose mean is to be returned Operation: This routine returns the mean of the specified prototype in the indicated dimension. Return: Mean of Prototype in Dimension Exceptions: none History: 7/6/89, DSJ, Created.

Definition at line 639 of file cluster.cpp.

639  {
640  return (Proto->Mean[Dimension]);
641 } // Mean
FLOAT32 * Mean
Definition: cluster.h:78
inT32 MergeClusters ( inT16  N,
PARAM_DESC  ParamDesc[],
inT32  n1,
inT32  n2,
FLOAT32  m[],
FLOAT32  m1[],
FLOAT32  m2[] 
)

MergeClusters ************************************************************ Parameters: N # of dimensions (size of arrays) ParamDesc array of dimension descriptions n1, n2 number of samples in each old cluster m array to hold mean of new cluster m1, m2 arrays containing means of old clusters Operation: This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly. Return: The number of samples in the new cluster. Exceptions: None History: 5/31/89, DSJ, Created.

Definition at line 882 of file cluster.cpp.

887  {
888  inT32 i, n;
889 
890  n = n1 + n2;
891  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
892  if (ParamDesc->Circular) {
893  // if distance between means is greater than allowed
894  // reduce upper point by one "rotation" to compute mean
895  // then normalize the mean back into the accepted range
896  if ((*m2 - *m1) > ParamDesc->HalfRange) {
897  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
898  if (*m < ParamDesc->Min)
899  *m += ParamDesc->Range;
900  }
901  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
902  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
903  if (*m < ParamDesc->Min)
904  *m += ParamDesc->Range;
905  }
906  else
907  *m = (n1 * *m1 + n2 * *m2) / n;
908  }
909  else
910  *m = (n1 * *m1 + n2 * *m2) / n;
911  }
912  return n;
913 } // MergeClusters
int inT32
Definition: host.h:102
CLUSTER* NextSample ( LIST SearchState)

NextSample ************************************************************ Parameters: SearchState ptr to list containing clusters to be searched Operation: This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, NULL is returned. InitSampleSearch() must be called before NextSample() to initialize the search. Return: Pointer to the next leaf cluster (sample) or NULL. Exceptions: None History: 6/16/89, DSJ, Created.

Definition at line 614 of file cluster.cpp.

614  {
615  CLUSTER *Cluster;
616 
617  if (*SearchState == NIL_LIST)
618  return (NULL);
619  Cluster = (CLUSTER *) first_node (*SearchState);
620  *SearchState = pop (*SearchState);
621  while (TRUE) {
622  if (Cluster->Left == NULL)
623  return (Cluster);
624  *SearchState = push (*SearchState, Cluster->Right);
625  Cluster = Cluster->Left;
626  }
627 } // NextSample
LIST pop(LIST list)
Definition: oldlist.cpp:305
Definition: cluster.h:32
#define NIL_LIST
Definition: oldlist.h:126
#define NULL
Definition: host.h:144
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
#define first_node(l)
Definition: oldlist.h:139
#define TRUE
Definition: capi.h:27
FLOAT32 StandardDeviation ( PROTOTYPE Proto,
uinT16  Dimension 
)

StandardDeviation ************************************************* Parameters: Proto prototype to return standard deviation of Dimension dimension whose stddev is to be returned Operation: This routine returns the standard deviation of the prototype in the indicated dimension. Return: Standard deviation of Prototype in Dimension Exceptions: none History: 7/6/89, DSJ, Created.

Definition at line 653 of file cluster.cpp.

653  {
654  switch (Proto->Style) {
655  case spherical:
656  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
657  case elliptical:
658  return ((FLOAT32)
659  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
660  case mixed:
661  switch (Proto->Distrib[Dimension]) {
662  case normal:
663  return ((FLOAT32)
664  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
665  case uniform:
666  case D_random:
667  return (Proto->Variance.Elliptical[Dimension]);
668  case DISTRIBUTION_COUNT:
669  ASSERT_HOST(!"Distribution count not allowed!");
670  }
671  }
672  return 0.0f;
673 } // StandardDeviation
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 Spherical
Definition: cluster.h:63
unsigned Style
Definition: cluster.h:74
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOATUNION Variance
Definition: cluster.h:81
float FLOAT32
Definition: host.h:111
Definition: cluster.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: cluster.h:59