Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
commontraining.cpp File Reference
#include "commontraining.h"
#include "base/init_google.h"
#include "base/commandlineflags.h"
#include "allheaders.h"
#include "ccutil.h"
#include "classify.h"
#include "oldlist.h"
#include "globals.h"
#include "mf.h"
#include "clusttool.h"
#include "cluster.h"
#include "tessopt.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "intfeaturespace.h"
#include "mastertrainer.h"
#include "tessdatamanager.h"
#include "tprintf.h"
#include "freelist.h"
#include "params.h"
#include "shapetable.h"
#include "unicity_table.h"
#include <math.h>

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 INT_PARAM_FLAG (debug_level, 0,"Level of Trainer debugging")
 
 INT_PARAM_FLAG (load_images, 0,"Load images with tr files")
 
 STRING_PARAM_FLAG (configfile,"","File to load more configs from")
 
 STRING_PARAM_FLAG (D,"","Directory to write output files to")
 
 STRING_PARAM_FLAG (F,"font_properties","File listing font properties")
 
 STRING_PARAM_FLAG (X,"","File listing font xheights")
 
 STRING_PARAM_FLAG (U,"unicharset","File to load unicharset from")
 
 STRING_PARAM_FLAG (O,"","File to write unicharset to")
 
 STRING_PARAM_FLAG (input_trainer,"","File to load trainer from")
 
 STRING_PARAM_FLAG (output_trainer,"","File to write trainer to")
 
 STRING_PARAM_FLAG (test_ch,"","UTF8 test character string")
 
void ParseArguments (int *argc, char ***argv)
 
ShapeTable * tesseract::LoadShapeTable (const STRING &file_prefix)
 
void tesseract::WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table)
 
MasterTrainer * tesseract::LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
 
const char * GetNextFilename (int argc, const char *const *argv)
 
LABELEDLIST FindList (LIST List, char *Label)
 
LABELEDLIST NewLabeledList (const char *Label)
 
void ReadTrainingSamples (const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
 
void FreeTrainingSamples (LIST CharList)
 
void FreeLabeledList (LABELEDLIST LabeledList)
 
CLUSTERER * SetUpForClustering (const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
 
void MergeInsignificantProtos (LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void CleanUpUnusedData (LIST ProtoList)
 
LIST RemoveInsignificantProtos (LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N)
 
MERGE_CLASS FindClass (LIST List, const char *Label)
 
MERGE_CLASS NewLabeledClass (const char *Label)
 
void FreeLabeledClassList (LIST ClassList)
 
CLASS_STRUCT * SetUpForFloat2Int (const UNICHARSET &unicharset, LIST LabeledClassList)
 
void Normalize (float *Values)
 
void FreeNormProtoList (LIST CharList)
 
void AddToNormProtosList (LIST *NormProtoList, LIST ProtoList, char *CharName)
 
int NumberOfProtos (LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
 

Variables

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }
 
const char * kUsage = "[flags] [ .tr files ... ]\n"
 
FEATURE_DEFS_STRUCT feature_defs
 
CCUtil ccutil
 

Function Documentation

void AddToNormProtosList ( LIST *  NormProtoList,
LIST  ProtoList,
char *  CharName 
)

Definition at line 935 of file commontraining.cpp.

// Wraps the prototypes of ProtoList in a new labeled list named CharName and
// pushes that labeled list onto *NormProtoList.
//   NormProtoList  in/out: list of labeled lists; updated through the pointer.
//   ProtoList      list whose nodes are PROTOTYPE pointers.
//   CharName       label given to the new labeled list (copied by
//                  NewLabeledList).
// Only the PROTOTYPE pointers are pushed; the prototypes are not copied.
939 {
940  PROTOTYPE* Proto;
941  LABELEDLIST LabeledProtoList;
942 
943  LabeledProtoList = NewLabeledList(CharName);
944  iterate(ProtoList)
945  {
946  Proto = (PROTOTYPE *) first_node (ProtoList);
947  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
948  }
949  *NormProtoList = push(*NormProtoList, LabeledProtoList);
950 }
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
LABELEDLIST NewLabeledList(const char *Label)
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
void CleanUpUnusedData ( LIST  ProtoList)

Definition at line 683 of file commontraining.cpp.

// Releases the per-dimension statistics of every prototype in ProtoList.
// For each PROTOTYPE the Variance, Magnitude and Weight "Elliptical" arrays
// are released with memfree() and the pointers reset to NULL, so calling this
// twice on the same list does not free anything a second time.
// The prototypes themselves and the list nodes are left in place.
685 {
686  PROTOTYPE* Prototype;
687 
688  iterate(ProtoList)
689  {
690  Prototype = (PROTOTYPE *) first_node (ProtoList);
691  if(Prototype->Variance.Elliptical != NULL)
692  {
693  memfree(Prototype->Variance.Elliptical);
694  Prototype->Variance.Elliptical = NULL;
695  }
696  if(Prototype->Magnitude.Elliptical != NULL)
697  {
698  memfree(Prototype->Magnitude.Elliptical);
699  Prototype->Magnitude.Elliptical = NULL;
700  }
701  if(Prototype->Weight.Elliptical != NULL)
702  {
703  memfree(Prototype->Weight.Elliptical);
704  Prototype->Weight.Elliptical = NULL;
705  }
706  }
707 }
void memfree(void *element)
Definition: freelist.cpp:30
FLOAT32 * Elliptical
Definition: cluster.h:64
#define NULL
Definition: host.h:144
FLOATUNION Variance
Definition: cluster.h:81
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:82
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
MERGE_CLASS FindClass ( LIST  List,
const char *  Label 
)

Definition at line 778 of file commontraining.cpp.

// Linear search of List for the MERGE_CLASS whose Label compares equal
// (strcmp) to Label. Returns the first match, or NULL when no node matches
// (including when List is empty).
781 {
782  MERGE_CLASS MergeClass;
783 
784  iterate (List)
785  {
786  MergeClass = (MERGE_CLASS) first_node (List);
787  if (strcmp (MergeClass->Label, Label) == 0)
788  return (MergeClass);
789  }
790  return (NULL);
791 
792 } /* FindClass */
#define NULL
Definition: host.h:144
MERGE_CLASS_NODE * MERGE_CLASS
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
LABELEDLIST FindList ( LIST  List,
char *  Label 
)

Definition at line 385 of file commontraining.cpp.

// Linear search of List for the LABELEDLIST whose Label compares equal
// (strcmp) to Label. Returns the first match, or NULL when no node matches.
// Label is only read here even though the parameter is declared char*.
403 {
404  LABELEDLIST LabeledList;
405 
406  iterate (List)
407  {
408  LabeledList = (LABELEDLIST) first_node (List);
409  if (strcmp (LabeledList->Label, Label) == 0)
410  return (LabeledList);
411  }
412  return (NULL);
413 
414 } /* FindList */
struct LABELEDLISTNODE * LABELEDLIST
#define NULL
Definition: host.h:144
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
void FreeLabeledClassList ( LIST  ClassList)

Definition at line 809 of file commontraining.cpp.

// Frees every MERGE_CLASS held in ClassList and then the list itself.
// Per node: the Label string is released with free(), the contained class
// with FreeClass(), and the node with delete (it is created with new in
// NewLabeledClass). destroy() then releases the list cells.
// NOTE(review): Label is allocated with Emalloc() in NewLabeledClass but
// released with free() here - assumes Emalloc memory is free()-compatible;
// confirm against emalloc.cpp.
824 {
825  MERGE_CLASS MergeClass;
826 
827  iterate (ClassList) /* iterate thru all of the fonts */
828  {
829  MergeClass = (MERGE_CLASS) first_node (ClassList);
830  free (MergeClass->Label);
831  FreeClass(MergeClass->Class);
832  delete MergeClass;
833  }
834  destroy (ClassList);
835 
836 } /* FreeLabeledClassList */
LIST destroy(LIST list)
Definition: oldlist.cpp:187
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:215
MERGE_CLASS_NODE * MERGE_CLASS
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
CLASS_TYPE Class
void FreeLabeledList ( LABELEDLIST  LabeledList)

Definition at line 548 of file commontraining.cpp.

// Releases one labeled list: its list cells (destroy), its Label string and
// the LABELEDLISTNODE itself (both with free()). The elements stored in the
// list are not freed; callers that own them must release them first.
548  {
549 /*
550  ** Parameters:
551  ** LabeledList labeled list to be freed
552  ** Globals: none
553  ** Operation:
554  ** This routine deallocates all of the memory consumed by
555  ** a labeled list. It does not free any memory which may be
556  ** consumed by the items in the list.
557  ** Return: none
558  ** Exceptions: none
559  ** History: Fri Aug 18 17:52:45 1989, DSJ, Created.
560  */
561  destroy(LabeledList->List);
562  free(LabeledList->Label);
563  free(LabeledList);
564 } /* FreeLabeledList */
LIST destroy(LIST list)
Definition: oldlist.cpp:187
void FreeNormProtoList ( LIST  CharList)

Definition at line 919 of file commontraining.cpp.

// Frees a list of labeled lists: FreeLabeledList() is called on every
// LABELEDLIST node of CharList, then the outer list is destroyed.
// The items stored inside each labeled list are not freed here, because
// FreeLabeledList() does not free list items.
922 {
923  LABELEDLIST char_sample;
924 
925  iterate (CharList) /* iterate thru all of the fonts */
926  {
927  char_sample = (LABELEDLIST) first_node (CharList);
928  FreeLabeledList (char_sample);
929  }
930  destroy (CharList);
931 
932 } // FreeNormProtoList
struct LABELEDLISTNODE * LABELEDLIST
LIST destroy(LIST list)
Definition: oldlist.cpp:187
void FreeLabeledList(LABELEDLIST LabeledList)
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
void FreeTrainingSamples ( LIST  CharList)

Definition at line 518 of file commontraining.cpp.

// Frees a list of labeled training-sample lists together with the feature
// sets they hold: every FEATURE_SET in every labeled list is released with
// FreeFeatureSet(), each labeled list with FreeLabeledList(), and finally
// the outer list with destroy().
// NOTE: the historical header below calls the parameter "FontList"; the
// actual parameter is CharList, whose nodes are LABELEDLIST entries.
518  {
519 /*
520  ** Parameters:
521  ** FontList list of all fonts in document
522  ** Globals: none
523  ** Operation:
524  ** This routine deallocates all of the space allocated to
525  ** the specified list of training samples.
526  ** Return: none
527  ** Exceptions: none
528  ** History: Fri Aug 18 17:44:27 1989, DSJ, Created.
529  */
530  LABELEDLIST char_sample;
531  FEATURE_SET FeatureSet;
532  LIST FeatureList;
533 
534 
535  iterate(CharList) { /* iterate thru all of the fonts */
536  char_sample = (LABELEDLIST) first_node(CharList);
537  FeatureList = char_sample->List;
538  iterate(FeatureList) { /* iterate thru all of the classes */
539  FeatureSet = (FEATURE_SET) first_node(FeatureList);
540  FreeFeatureSet(FeatureSet);
541  }
542  FreeLabeledList(char_sample);
543  }
544  destroy(CharList);
545 } /* FreeTrainingSamples */
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:73
struct LABELEDLISTNODE * LABELEDLIST
LIST destroy(LIST list)
Definition: oldlist.cpp:187
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:79
void FreeLabeledList(LABELEDLIST LabeledList)
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
const char* GetNextFilename ( int  argc,
const char *const *  argv 
)

Definition at line 362 of file commontraining.cpp.

// Returns argv[tessoptind] and advances the global tessoptind, or returns
// NULL once tessoptind has reached argc.
// NOTE: the historical header below says "Parameters: none"; the function
// takes argc (argument count) and argv (argument vector).
362  {
363  /*
364  ** Parameters: none
365  ** Globals:
366  ** tessoptind defined by tessopt sys call
367  ** Operation:
368  ** This routine returns the next command line argument. If
369  ** there are no remaining command line arguments, it returns
370  ** NULL. This routine should only be called after all option
371  ** arguments have been parsed and removed with ParseArguments.
372  ** Return: Next command line argument or NULL.
373  ** Exceptions: none
374  ** History: Fri Aug 18 09:34:12 1989, DSJ, Created.
375  */
376  if (tessoptind < argc)
377  return argv[tessoptind++];
378  else
379  return NULL;
380 } /* GetNextFilename */
int tessoptind
Definition: tessopt.cpp:26
#define NULL
Definition: host.h:144
INT_PARAM_FLAG ( debug_level  ,
0  ,
"Level of Trainer debugging"   
)
INT_PARAM_FLAG ( load_images  ,
0  ,
"Load images with tr files"   
)
void MergeInsignificantProtos ( LIST  ProtoList,
const char *  label,
CLUSTERER *  Clusterer,
CLUSTERCONFIG *  Config 
)

Definition at line 618 of file commontraining.cpp.

// Merges insignificant prototypes into nearby prototypes and then promotes
// those that have accumulated enough samples.
//   ProtoList  list of PROTOTYPE pointers; prototypes are modified in place.
//   label      class label; debug output is printed only when it equals the
//              test_ch flag.
//   Clusterer  supplies SampleSize, ParamDesc and NumChar.
//   Config     supplies MinSamples.
// Pass 1: for every prototype that is neither Significant nor Merged, find
// the closest other un-merged prototype whose ComputeDistance() is below
// 0.125. If that neighbour is also insignificant the two are merged into the
// neighbour (MergeClusters updates its Mean and NumSamples) and the current
// prototype is emptied and flagged Merged. If the neighbour is significant
// the current prototype is only flagged Merged.
// Pass 2: un-merged insignificant prototypes with at least
// MinSamples * NumChar samples are marked Significant.
// The debug messages call insignificant prototypes "red" and significant
// ones "green".
619  {
620  PROTOTYPE *Prototype;
621  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
622 
623  LIST pProtoList = ProtoList;
624  iterate(pProtoList) {
625  Prototype = (PROTOTYPE *) first_node (pProtoList);
626  if (Prototype->Significant || Prototype->Merged)
627  continue;
  // 0.125 is the maximum distance at which a neighbour counts as a match.
628  FLOAT32 best_dist = 0.125;
629  PROTOTYPE* best_match = NULL;
630  // Find the nearest alive prototype.
631  LIST list_it = ProtoList;
632  iterate(list_it) {
633  PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
634  if (test_p != Prototype && !test_p->Merged) {
635  FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
636  Clusterer->ParamDesc,
637  Prototype->Mean, test_p->Mean);
638  if (dist < best_dist) {
639  best_match = test_p;
640  best_dist = dist;
641  }
642  }
643  }
644  if (best_match != NULL && !best_match->Significant) {
645  if (debug)
646  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
647  best_match->NumSamples, Prototype->NumSamples,
648  best_match->Mean[0], best_match->Mean[1],
649  Prototype->Mean[0], Prototype->Mean[1]);
  // best_match->Mean is passed as both the output mean and the first input.
650  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
651  Clusterer->ParamDesc,
652  best_match->NumSamples,
653  Prototype->NumSamples,
654  best_match->Mean,
655  best_match->Mean, Prototype->Mean);
656  Prototype->NumSamples = 0;
657  Prototype->Merged = 1;
658  } else if (best_match != NULL) {
659  if (debug)
660  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
661  Prototype->Mean[0], Prototype->Mean[1],
662  best_match->Mean[0], best_match->Mean[1]);
663  Prototype->Merged = 1;
664  }
665  }
666  // Mark significant those that now have enough samples.
  // The product is truncated to an integer by the inT32 cast.
667  int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
668  pProtoList = ProtoList;
669  iterate(pProtoList) {
670  Prototype = (PROTOTYPE *) first_node (pProtoList);
671  // Process insignificant protos that do not match a green one
672  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
673  !Prototype->Merged) {
674  if (debug)
675  tprintf("Red proto at %g,%g becoming green\n",
676  Prototype->Mean[0], Prototype->Mean[1]);
677  Prototype->Significant = true;
678  }
679  }
680 } /* MergeInsignificantProtos */
unsigned Merged
Definition: cluster.h:69
unsigned NumSamples
Definition: cluster.h:75
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT32 NumChar
Definition: cluster.h:93
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
float FLOAT32
Definition: host.h:111
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[])
Definition: kdtree.cpp:486
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
unsigned Significant
Definition: cluster.h:68
FLOAT32 MinSamples
Definition: cluster.h:50
FLOAT32 * Mean
Definition: cluster.h:78
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
MERGE_CLASS NewLabeledClass ( const char *  Label)

Definition at line 795 of file commontraining.cpp.

// Allocates a MERGE_CLASS_NODE with new, gives it a private copy of Label
// (Emalloc + strcpy) and a fresh class created by
// NewClass(MAX_NUM_PROTOS, MAX_NUM_CONFIGS). Returns the new node; the
// matching release routine on this page is FreeLabeledClassList().
797 {
798  MERGE_CLASS MergeClass;
799 
800  MergeClass = new MERGE_CLASS_NODE;
801  MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
802  strcpy (MergeClass->Label, Label);
803  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
804  return (MergeClass);
805 
806 } /* NewLabeledClass */
#define MAX_NUM_CONFIGS
Definition: intproto.h:44
#define MAX_NUM_PROTOS
Definition: intproto.h:45
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:248
CLASS_TYPE Class
LABELEDLIST NewLabeledList ( const char *  Label)

Definition at line 417 of file commontraining.cpp.

// Allocates a LABELEDLISTNODE with Emalloc, gives it a private copy of Label
// (Emalloc + strcpy), an empty List (NIL_LIST) and zeroed SampleCount and
// font_sample_count. Returns the new node; the matching release routine on
// this page is FreeLabeledList().
432 {
433  LABELEDLIST LabeledList;
434 
435  LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
436  LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
437  strcpy (LabeledList->Label, Label);
438  LabeledList->List = NIL_LIST;
439  LabeledList->SampleCount = 0;
440  LabeledList->font_sample_count = 0;
441  return (LabeledList);
442 
443 } /* NewLabeledList */
#define NIL_LIST
Definition: oldlist.h:126
struct LABELEDLISTNODE * LABELEDLIST
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
void Normalize ( float *  Values)

Definition at line 902 of file commontraining.cpp.

// Converts, in place, a line given as (Values[0], Values[1], Values[2]) into
// the coefficients of its normalized implicit form.
// On entry Values[0] and Values[1] are a point on the line and Values[2] is
// its direction; the direction is multiplied by 2*PI before tan(), so it is
// presumably expressed as a fraction of a full turn - TODO confirm.
// On exit (A, B, C) = (Values[0], Values[1], Values[2]) describe
// A*x + B*y + C = 0 with A*A + B*B == 1:
//   A = Slope / sqrt(Slope^2 + 1), B = -1 / sqrt(Slope^2 + 1),
//   C = Intercept / sqrt(Slope^2 + 1).
// SetUpForFloat2Int() on this page stores the results as proto A, B and C.
904 {
905  register float Slope;
906  register float Intercept;
907  register float Normalizer;
908 
909  Slope = tan (Values [2] * 2 * PI);
910  Intercept = Values [1] - Slope * Values [0];
911  Normalizer = 1 / sqrt (Slope * Slope + 1.0);
912 
913  Values [0] = Slope * Normalizer;
914  Values [1] = - Normalizer;
915  Values [2] = Intercept * Normalizer;
916 } // Normalize
#define PI
Definition: const.h:19
int NumberOfProtos ( LIST  ProtoList,
BOOL8  CountSigProtos,
BOOL8  CountInsigProtos 
)

Definition at line 953 of file commontraining.cpp.

// Counts the prototypes in ProtoList selected by the two flags: significant
// prototypes are counted when CountSigProtos is set, insignificant ones when
// CountInsigProtos is set. Returns the count (0 for an empty list or when
// both flags are false).
957 {
958  int N = 0;
959  PROTOTYPE *Proto;
960 
961  iterate(ProtoList)
962  {
963  Proto = (PROTOTYPE *) first_node ( ProtoList );
964  if (( Proto->Significant && CountSigProtos ) ||
965  ( ! Proto->Significant && CountInsigProtos ) )
966  N++;
967  }
968  return(N);
969 }
unsigned Significant
Definition: cluster.h:68
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
void ParseArguments ( int *  argc,
char ***  argv 
)

Definition at line 88 of file commontraining.cpp.

// Parses the trainer command line.
// When USE_STD_NAMESPACE is not defined, parsing is delegated to InitGoogle()
// and tessoptind is set to 1. Otherwise options are read with tessopt():
//   -C/-I/-M/-B set Config.Confidence / Independence / MinSamples /
//      MaxIllegal; each value is clamped to the range [0, 1].
//   -c/-D/-U/-O/-F/-X set the configfile, D, U, O, F and X flags.
//   An unparsable value or an unknown option prints the usage line to stderr
//   and exits with status 2.
// Finally, when the configfile flag is non-empty, extra parameters are read
// from that file.
// NOTE(review): the option string also accepts "S:" but the switch has no
// case for it, so -S values are silently ignored.
88  {
89 /*
90  ** Parameters:
91  ** argc number of command line arguments to parse
92  ** argv command line arguments
93  ** Globals:
94  ** ShowSignificantProtos flag controlling proto display
95  ** ShowInsignificantProtos flag controlling proto display
96  ** Config current clustering parameters
97  ** tessoptarg, tessoptind defined by tessopt sys call
98  ** Argc, Argv global copies of argc and argv
99  ** Operation:
100  ** This routine parses the command line arguments that were
101  ** passed to the program. The legal arguments are shown in the usage
102  ** message below:
103 
104  ** Return: none
105  ** Exceptions: Illegal options terminate the program.
106  ** History: 7/24/89, DSJ, Created.
107  */
108 #ifndef USE_STD_NAMESPACE
109  InitGoogle(kUsage, argc, argv, true);
110  tessoptind = 1;
111 #else
112  int Option;
113  int ParametersRead;
114  BOOL8 Error;
115 
116  Error = FALSE;
117  while ((Option = tessopt(*argc, *argv, "F:O:U:D:C:I:M:B:S:X:c:")) != EOF) {
118  switch (Option) {
119  case 'C':
  // Confidence is FLOAT64, hence %lf; the other three fields are FLOAT32.
120  ParametersRead = sscanf(tessoptarg, "%lf", &(Config.Confidence) );
121  if ( ParametersRead != 1 ) Error = TRUE;
122  else if ( Config.Confidence > 1 ) Config.Confidence = 1;
123  else if ( Config.Confidence < 0 ) Config.Confidence = 0;
124  break;
125  case 'I':
126  ParametersRead = sscanf(tessoptarg, "%f", &(Config.Independence) );
127  if ( ParametersRead != 1 ) Error = TRUE;
128  else if ( Config.Independence > 1 ) Config.Independence = 1;
129  else if ( Config.Independence < 0 ) Config.Independence = 0;
130  break;
131  case 'M':
132  ParametersRead = sscanf(tessoptarg, "%f", &(Config.MinSamples) );
133  if ( ParametersRead != 1 ) Error = TRUE;
134  else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
135  else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
136  break;
137  case 'B':
138  ParametersRead = sscanf(tessoptarg, "%f", &(Config.MaxIllegal) );
139  if ( ParametersRead != 1 ) Error = TRUE;
140  else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
141  else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
142  break;
143  case 'c':
144  FLAGS_configfile.set_value(tessoptarg);
145  break;
146  case 'D':
147  FLAGS_D.set_value(tessoptarg);
148  break;
149  case 'U':
150  FLAGS_U.set_value(tessoptarg);
151  break;
152  case 'O':
153  FLAGS_O.set_value(tessoptarg);
154  break;
155  case 'F':
156  FLAGS_F.set_value(tessoptarg);
157  break;
158  case 'X':
159  FLAGS_X.set_value(tessoptarg);
160  break;
161  case '?':
162  Error = TRUE;
163  break;
164  }
165  if (Error) {
166  fprintf(stderr, "Usage: %s %s\n", (*argv)[0], kUsage);
167  exit(2);
168  }
169  }
170 #endif
171  // Set additional parameters from config file if specified.
  // NOTE(review): listing lines 173 and 175 are missing from this rendering.
  // Judging by the ReadParamsFile(file, constraint, member_params)
  // cross-reference, line 173 presumably opens the ReadParamsFile( call and
  // line 175 supplies the constraint argument - confirm against the source.
172  if (!FLAGS_configfile.empty()) {
174  FLAGS_configfile.c_str(),
176  ccutil.params());
177  }
178 } // ParseArguments
FLOAT64 Confidence
Definition: cluster.h:54
FLOAT32 Independence
Definition: cluster.h:53
int tessoptind
Definition: tessopt.cpp:26
unsigned char BOOL8
Definition: host.h:113
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:43
const char * kUsage
#define FALSE
Definition: capi.h:28
CLUSTERCONFIG Config
CCUtil ccutil
char * tessoptarg
Definition: tessopt.cpp:27
FLOAT32 MaxIllegal
Definition: cluster.h:51
int tessopt(inT32 argc, char *argv[], const char *arglist)
Definition: tessopt.cpp:35
FLOAT32 MinSamples
Definition: cluster.h:50
ParamsVectors * params()
Definition: ccutil.h:65
#define TRUE
Definition: capi.h:27
void ReadTrainingSamples ( const FEATURE_DEFS_STRUCT &  feature_defs,
const char *  feature_name,
int  max_samples,
UNICHARSET *  unicharset,
FILE *  file,
LIST *  training_samples 
)

Definition at line 448 of file commontraining.cpp.

// Reads training samples from file and appends the feature sets of type
// feature_name to the per-character labeled lists in *training_samples.
//   feature_defs      feature definitions used to parse the file.
//   feature_name      short name of the feature type to keep.
//   max_samples       per-call cap on samples kept for each character;
//                     a value <= 0 means no cap.
//   unicharset        optional (may be NULL); unseen characters are inserted.
//   file              open text file to read from.
//   training_samples  in/out list of LABELEDLISTs, one per character label.
// Each record is a header line (second whitespace-separated token = the
// character) followed by a character description read with
// ReadCharDescription(). Feature sets of other types are freed, as is the
// selected set once the cap is reached. Exits with status 1 if the
// unicharset grows beyond MAX_NUM_CLASSES.
451  {
452 /*
453 ** Parameters:
454 ** file open text file to read samples from
455 ** Globals: none
456 ** Operation:
457 ** This routine reads training samples from a file and
458 ** places them into a data structure which organizes the
459 ** samples by FontName and CharName. It then returns this
460 ** data structure.
461 ** Return: none
462 ** Exceptions: none
463 ** History: Fri Aug 18 13:11:39 1989, DSJ, Created.
464 ** Tue May 17 1998 simplifications to structure, eliminated
465 ** font, and feature specification levels of structure.
466 */
467  char buffer[2048];
468  char unichar[UNICHAR_LEN + 1];
469  LABELEDLIST char_sample;
470  FEATURE_SET feature_samples;
471  CHAR_DESC char_desc;
472  int i;
473  int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
474  // Zero out the font_sample_count for all the classes.
475  LIST it = *training_samples;
476  iterate(it) {
477  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
478  char_sample->font_sample_count = 0;
479  }
480 
481  while (fgets(buffer, 2048, file) != NULL) {
482  if (buffer[0] == '\n')
483  continue;
484 
  // NOTE(review): %s has no field width, so a second token longer than
  // UNICHAR_LEN bytes overruns unichar; the sscanf result is also unchecked,
  // so unichar keeps its previous contents when the line has no second token.
485  sscanf(buffer, "%*s %s", unichar);
486  if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
487  unicharset->unichar_insert(unichar);
488  if (unicharset->size() > MAX_NUM_CLASSES) {
489  tprintf("Error: Size of unicharset in training is "
490  "greater than MAX_NUM_CLASSES\n");
491  exit(1);
492  }
493  }
494  char_sample = FindList(*training_samples, unichar);
495  if (char_sample == NULL) {
496  char_sample = NewLabeledList(unichar);
497  *training_samples = push(*training_samples, char_sample);
498  }
499  char_desc = ReadCharDescription(feature_defs, file);
500  feature_samples = char_desc->FeatureSets[feature_type];
  // Keep the set (the list takes the pointer) unless the cap is reached.
501  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
502  char_sample->List = push(char_sample->List, feature_samples);
503  char_sample->SampleCount++;
504  char_sample->font_sample_count++;
505  } else {
506  FreeFeatureSet(feature_samples);
507  }
  // The selected set was handled above; release all the other types.
508  for (i = 0; i < char_desc->NumFeatureSets; i++) {
509  if (feature_type != i)
510  FreeFeatureSet(char_desc->FeatureSets[i]);
511  }
512  free(char_desc);
513  }
514 } // ReadTrainingSamples
int size() const
Definition: unicharset.h:264
#define NULL
Definition: host.h:144
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:79
LABELEDLIST NewLabeledList(const char *Label)
LABELEDLIST FindList(LIST List, char *Label)
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:511
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:261
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:44
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:543
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define UNICHAR_LEN
Definition: unichar.h:28
uinT32 NumFeatureSets
Definition: featdefs.h:43
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:300
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
LIST RemoveInsignificantProtos ( LIST  ProtoList,
BOOL8  KeepSigProtos,
BOOL8  KeepInsigProtos,
int  N 
)

Definition at line 710 of file commontraining.cpp.

// Builds and returns a new prototype list holding deep copies of the
// prototypes of ProtoList selected by the flags (significant ones when
// KeepSigProtos is set, insignificant ones when KeepInsigProtos is set),
// in the original order (push_last). N is the number of elements copied
// from Mean and from each non-NULL Elliptical array.
// The input list is released with FreeProtoList(); because ProtoList is
// passed by value, the caller's own list variable is not reset and must be
// replaced with the returned list.
// In each copy Cluster and Distrib are set to NULL.
// NOTE(review): the Merged field is never assigned in the copy; its value
// depends on whether Emalloc zero-fills - confirm.
716 {
717  LIST NewProtoList = NIL_LIST;
718  LIST pProtoList;
719  PROTOTYPE* Proto;
720  PROTOTYPE* NewProto;
721  int i;
722 
723  pProtoList = ProtoList;
724  iterate(pProtoList)
725  {
726  Proto = (PROTOTYPE *) first_node (pProtoList);
727  if ((Proto->Significant && KeepSigProtos) ||
728  (!Proto->Significant && KeepInsigProtos))
729  {
730  NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
731 
732  NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
733  NewProto->Significant = Proto->Significant;
734  NewProto->Style = Proto->Style;
735  NewProto->NumSamples = Proto->NumSamples;
736  NewProto->Cluster = NULL;
737  NewProto->Distrib = NULL;
738 
739  for (i=0; i < N; i++)
740  NewProto->Mean[i] = Proto->Mean[i];
741  if (Proto->Variance.Elliptical != NULL)
742  {
743  NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
744  for (i=0; i < N; i++)
745  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
746  }
747  else
748  NewProto->Variance.Elliptical = NULL;
749  //---------------------------------------------
750  if (Proto->Magnitude.Elliptical != NULL)
751  {
752  NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
753  for (i=0; i < N; i++)
754  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
755  }
756  else
757  NewProto->Magnitude.Elliptical = NULL;
758  //------------------------------------------------
759  if (Proto->Weight.Elliptical != NULL)
760  {
761  NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
762  for (i=0; i < N; i++)
763  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
764  }
765  else
766  NewProto->Weight.Elliptical = NULL;
767 
768  NewProto->TotalMagnitude = Proto->TotalMagnitude;
769  NewProto->LogMagnitude = Proto->LogMagnitude;
770  NewProtoList = push_last(NewProtoList, NewProto);
771  }
772  }
773  FreeProtoList(&ProtoList);
774  return (NewProtoList);
775 } /* RemoveInsignificantProtos */
FLOAT32 TotalMagnitude
Definition: cluster.h:79
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:338
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned NumSamples
Definition: cluster.h:75
#define NIL_LIST
Definition: oldlist.h:126
unsigned Style
Definition: cluster.h:74
FLOAT32 * Elliptical
Definition: cluster.h:64
#define NULL
Definition: host.h:144
FLOATUNION Variance
Definition: cluster.h:81
float FLOAT32
Definition: host.h:111
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Weight
Definition: cluster.h:83
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
unsigned Significant
Definition: cluster.h:68
CLUSTER * Cluster
Definition: cluster.h:76
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:560
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Mean
Definition: cluster.h:78
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
CLUSTERER* SetUpForClustering ( const FEATURE_DEFS_STRUCT &  FeatureDefs,
LABELEDLIST  char_sample,
const char *  program_feature_type 
)

Definition at line 567 of file commontraining.cpp.

// Creates a clusterer for the feature type named program_feature_type and
// feeds it every feature of every feature set in char_sample->List.
// The sample dimension N is the NumParams of that feature type. All features
// of one feature set share a CharID, which starts at 0 and is incremented
// per feature set. Returns the new clusterer.
569  {
570 /*
571  ** Parameters:
572  ** char_sample: LABELEDLIST that holds all the feature information for a
573  ** given character.
574  ** Globals:
575  ** None
576  ** Operation:
577  ** This routine reads samples from a LABELEDLIST and enters
578  ** those samples into a clusterer data structure. This
579  ** data structure is then returned to the caller.
580  ** Return:
581  ** Pointer to new clusterer data structure.
582  ** Exceptions:
583  ** None
584  ** History:
585  ** 8/16/89, DSJ, Created.
586  */
587  uinT16 N;
588  int i, j;
589  FLOAT32 *Sample = NULL;
590  CLUSTERER *Clusterer;
591  inT32 CharID;
592  LIST FeatureList = NULL;
593  FEATURE_SET FeatureSet = NULL;
594 
595  int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
596  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
597  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
598 
599  FeatureList = char_sample->List;
600  CharID = 0;
601  iterate(FeatureList) {
602  FeatureSet = (FEATURE_SET) first_node(FeatureList);
  // NOTE(review): the bound is MaxNumFeatures; this assumes every slot up to
  // that index holds a valid feature - confirm.
603  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
  // One scratch buffer is allocated lazily and reused for every sample.
604  if (Sample == NULL)
605  Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
606  for (j = 0; j < N; j++)
607  Sample[j] = FeatureSet->Features[i]->Params[j];
608  MakeSample (Clusterer, Sample, CharID);
609  }
610  CharID++;
611  }
612  if ( Sample != NULL ) free( Sample );
613  return( Clusterer );
614 
615 } /* SetUpForClustering */
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:73
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:450
float FLOAT32
Definition: host.h:111
FEATURE Features[1]
Definition: ocrfeatures.h:71
FLOAT32 Params[1]
Definition: ocrfeatures.h:64
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
unsigned short uinT16
Definition: host.h:101
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:395
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:58
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:300
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
CLASS_STRUCT* SetUpForFloat2Int ( const UNICHARSET &  unicharset,
LIST  LabeledClassList 
)

SetUpForFloat2Int

Definition at line 839 of file commontraining.cpp.

// Builds an array of CLASS_STRUCT, one slot per unicharset entry
// (new CLASS_STRUCT[unicharset.size()]), and fills the slot of every merge
// class in LabeledClassList, indexed by unichar_to_id(Label).
// For each merge class:
//   - prototypes are copied (X, Y, Length, Angle) and their A, B, C
//     coefficients are computed with Normalize() from (X, Y, Angle);
//   - the font_set is moved out of the merge class into the new class;
//   - every configuration bit vector is copied into a new bit vector of
//     NumProtos bits.
// Returns the array; it is allocated with new[], so the caller presumably
// releases it with delete[] - TODO confirm. Slots of unicharset entries with
// no merge class are left default-constructed.
840  {
841  MERGE_CLASS MergeClass;
842  CLASS_TYPE Class;
843  int NumProtos;
844  int NumConfigs;
845  int NumWords;
846  int i, j;
847  float Values[3];
848  PROTO NewProto;
849  PROTO OldProto;
850  BIT_VECTOR NewConfig;
851  BIT_VECTOR OldConfig;
852 
853  // printf("Float2Int ...\n");
854 
855  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
856  iterate(LabeledClassList)
857  {
858  UnicityTableEqEq<int> font_set;
859  MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
860  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
861  NumProtos = MergeClass->Class->NumProtos;
862  NumConfigs = MergeClass->Class->NumConfigs;
  // The font set is moved through a local: source -> font_set -> Class.
863  font_set.move(&MergeClass->Class->font_set);
864  Class->NumProtos = NumProtos;
865  Class->MaxNumProtos = NumProtos;
866  Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
867  for(i=0; i < NumProtos; i++)
868  {
869  NewProto = ProtoIn(Class, i);
870  OldProto = ProtoIn(MergeClass->Class, i);
871  Values[0] = OldProto->X;
872  Values[1] = OldProto->Y;
873  Values[2] = OldProto->Angle;
874  Normalize(Values);
875  NewProto->X = OldProto->X;
876  NewProto->Y = OldProto->Y;
877  NewProto->Length = OldProto->Length;
878  NewProto->Angle = OldProto->Angle;
879  NewProto->A = Values[0];
880  NewProto->B = Values[1];
881  NewProto->C = Values[2];
882  }
883 
884  Class->NumConfigs = NumConfigs;
885  Class->MaxNumConfigs = NumConfigs;
886  Class->font_set.move(&font_set);
887  Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
  // Number of words needed to hold NumProtos bits.
888  NumWords = WordsInVectorOfSize(NumProtos);
889  for(i=0; i < NumConfigs; i++)
890  {
891  NewConfig = NewBitVector(NumProtos);
892  OldConfig = MergeClass->Class->Configurations[i];
893  for(j=0; j < NumWords; j++)
894  NewConfig[j] = OldConfig[j];
895  Class->Configurations[i] = NewConfig;
896  }
897  }
898  return float_classes;
899 } // SetUpForFloat2Int
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
int size() const
Definition: unicharset.h:264
FLOAT32 B
Definition: protos.h:45
FLOAT32 Length
Definition: protos.h:50
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
FLOAT32 Y
Definition: protos.h:48
inT16 NumProtos
Definition: protos.h:59
CONFIGS Configurations
Definition: protos.h:64
#define ProtoIn(Class, Pid)
Definition: protos.h:123
FLOAT32 A
Definition: protos.h:44
UnicityTableEqEq< int > font_set
Definition: protos.h:65
inT16 NumConfigs
Definition: protos.h:62
void move(UnicityTable< T > *from)
FLOAT32 Angle
Definition: protos.h:49
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
void * Emalloc(size_t Size)
Definition: emalloc.cpp:35
FLOAT32 X
Definition: protos.h:47
PROTO_STRUCT * PROTO
Definition: protos.h:52
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:111
void Normalize(float *Values)
inT16 MaxNumProtos
Definition: protos.h:60
inT16 MaxNumConfigs
Definition: protos.h:63
MERGE_CLASS_NODE * MERGE_CLASS
PROTO Prototypes
Definition: protos.h:61
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
CLASS_TYPE Class
FLOAT32 C
Definition: protos.h:46
STRING_PARAM_FLAG ( configfile  ,
""  ,
"File to load more configs from"   
)
STRING_PARAM_FLAG ( D  ,
""  ,
"Directory to write output files to"   
)
STRING_PARAM_FLAG ( F  ,
"font_properties"  ,
"File listing font properties"   
)
STRING_PARAM_FLAG ( X  ,
""  ,
"File listing font xheights"   
)
STRING_PARAM_FLAG ( U  ,
"unicharset"  ,
"File to load unicharset from"   
)
STRING_PARAM_FLAG ( O  ,
""  ,
"File to write unicharset to"   
)
STRING_PARAM_FLAG ( input_trainer  ,
""  ,
"File to load trainer from"   
)
STRING_PARAM_FLAG ( output_trainer  ,
""  ,
"File to write trainer to"   
)
STRING_PARAM_FLAG ( test_ch  ,
""  ,
"UTF8 test character string"   
)

Variable Documentation

CCUtil ccutil

Definition at line 85 of file commontraining.cpp.

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }

Definition at line 53 of file commontraining.cpp.

FEATURE_DEFS_STRUCT feature_defs

Definition at line 84 of file commontraining.cpp.

const char* kUsage = "[flags] [ .tr files ... ]\n"

Definition at line 71 of file commontraining.cpp.