Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
classify.h
Go to the documentation of this file.
1 // File: classify.h
3 // Description: classify class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
20 #define TESSERACT_CLASSIFY_CLASSIFY_H__
21 
22 #include "adaptive.h"
23 #include "ccstruct.h"
24 #include "classify.h"
25 #include "dict.h"
26 #include "featdefs.h"
27 #include "fontinfo.h"
28 #include "intfx.h"
29 #include "intmatcher.h"
30 #include "normalis.h"
31 #include "ratngs.h"
32 #include "ocrfeatures.h"
33 #include "unicity_table.h"
34 
35 class ScrollView;
36 class WERD_CHOICE;
37 class WERD_RES;
38 struct ADAPT_RESULTS;
39 struct NORM_PROTOS;
40 
41 static const int kUnknownFontinfoId = -1;
42 static const int kBlankFontinfoId = -2;
43 
44 namespace tesseract {
45 
46 struct ShapeRating;
47 class ShapeTable;
48 
49 // How well segmented a blob is. In this enum, "character" means a classifiable
50 // unit; that term is too long, and "character" is usually easier to understand.
52  CST_FRAGMENT, // A partial character.
53  CST_WHOLE, // A correctly segmented character.
54  CST_IMPROPER, // More than one but fewer than two characters.
55  CST_NGRAM // Multiple characters.
56 };
57 
58 class Classify : public CCStruct {
59  public:
60  Classify();
61  virtual ~Classify();
63  return dict_;
64  }
65 
66  const ShapeTable* shape_table() const {
67  return shape_table_;
68  }
69 
70  /* adaptive.cpp ************************************************************/
71  ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
72  int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId);
73  // Runs the class pruner from int_templates on the given features, returning
74  // the number of classes output in results.
75  // int_templates Class pruner tables
76  // num_features Number of features in blob
77  // features Array of features
78  // normalization_factors (input) Array of int_templates->NumClasses fudge
79  // factors from blob normalization process.
80  // (Indexed by CLASS_INDEX)
81  // expected_num_features (input) Array of int_templates->NumClasses
82  // expected number of features for each class.
83  // (Indexed by CLASS_INDEX)
84  // results (output) Sorted Array of pruned classes.
85  // Array must be sized to take the maximum possible
86  // number of outputs : int_templates->NumClasses.
87  int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
88  int num_features,
89  const INT_FEATURE_STRUCT* features,
90  const uinT8* normalization_factors,
91  const uinT16* expected_num_features,
92  CP_RESULT_STRUCT* results);
93  void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
94  CLASS_CUTOFF_ARRAY Cutoffs);
95  void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
96  void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
98  /* normmatch.cpp ************************************************************/
100  const FEATURE_STRUCT& feature, BOOL8 DebugMatch);
101  void FreeNormProtos();
102  NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
103  /* protos.cpp ***************************************************************/
104  void ReadClassFile();
105  void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
107  const UNICHARSET& target_unicharset);
108  /* adaptmatch.cpp ***********************************************************/
109 
110  // Learn the given word using its chopped_word, seam_array, denorm,
111  // box_word, best_state, and correct_text to learn both correctly and
112  // incorrectly segmented blobs. If filename is not NULL, then LearnBlob
113  // is called and the data will be written to a file for static training.
114  // Otherwise AdaptToBlob is called for adaption within a document.
115  // If rejmap is not NULL, then only chars with a rejmap entry of '1' will
116  // be learned, otherwise all chars with good correct_text are learned.
117  void LearnWord(const char* filename, const char *rejmap, WERD_RES *word);
118 
119  // Builds a blob of length fragments from the word, starting at start,
120  // and then learns it as having the given correct_text.
121  // If filename is not NULL, then LearnBlob
122  // is called and the data will be written to a file for static training.
123  // Otherwise AdaptToBlob is called for adaption within a document.
124  // threshold is a magic number required by AdaptToChar and generated by
125  // GetAdaptThresholds.
126  // Although it can be partly inferred from the string, segmentation is
127  // provided to explicitly clarify the character segmentation.
128  void LearnPieces(const char* filename, int start, int length,
129  float threshold, CharSegmentationType segmentation,
130  const char* correct_text, WERD_RES *word);
131  void InitAdaptiveClassifier(bool load_pre_trained_templates);
132  void InitAdaptedClass(TBLOB *Blob,
133  const DENORM& denorm,
134  CLASS_ID ClassId,
135  int FontinfoId,
136  ADAPT_CLASS Class,
137  ADAPT_TEMPLATES Templates);
138  void AdaptToPunc(TBLOB *Blob,
139  const DENORM& denorm,
140  CLASS_ID ClassId,
141  int FontinfoId,
142  FLOAT32 Threshold);
143  void AmbigClassifier(TBLOB *Blob,
144  const DENORM& denorm,
145  INT_TEMPLATES Templates,
146  ADAPT_CLASS *Classes,
147  UNICHAR_ID *Ambiguities,
148  ADAPT_RESULTS *Results);
149  void MasterMatcher(INT_TEMPLATES templates,
150  inT16 num_features,
151  const INT_FEATURE_STRUCT* features,
152  const uinT8* norm_factors,
153  ADAPT_CLASS* classes,
154  int debug,
155  int num_classes,
156  const TBOX& blob_box,
157  CLASS_PRUNER_RESULTS results,
158  ADAPT_RESULTS* final_results);
159  // Converts configs to fonts, and if the result is not adapted, and a
160  // shape_table_ is present, the shape is expanded to include all
161  // unichar_ids represented, before applying a set of corrections to the
162  // distance rating in int_result (see ComputeCorrectedRating).
163  // The results are added to the final_results output.
165  bool debug,
166  int class_id,
167  int bottom, int top,
168  float cp_rating,
169  int blob_length,
170  const uinT8* cn_factors,
171  INT_RESULT_STRUCT& int_result,
172  ADAPT_RESULTS* final_results);
173  // Applies a set of corrections to the distance im_rating,
174  // including the cn_correction, miss penalty and additional penalty
175  // for non-alnums being vertical misfits. Returns the corrected distance.
176  double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
177  double im_rating, int feature_misses,
178  int bottom, int top,
179  int blob_length, const uinT8* cn_factors);
180  void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
181  ADAPT_RESULTS *Results,
182  BLOB_CHOICE_LIST *Choices);
183  void AddNewResult(ADAPT_RESULTS *results,
184  CLASS_ID class_id,
185  int shape_id,
186  FLOAT32 rating,
187  bool adapted,
188  int config,
189  int fontinfo_id,
190  int fontinfo_id2);
191  int GetAdaptiveFeatures(TBLOB *Blob,
192  INT_FEATURE_ARRAY IntFeatures,
193  FEATURE_SET *FloatFeatures);
194 
195 #ifndef GRAPHICS_DISABLED
196  void DebugAdaptiveClassifier(TBLOB *Blob,
197  const DENORM& denorm,
198  ADAPT_RESULTS *Results);
199 #endif
200  void GetAdaptThresholds (TWERD * Word,
201  const DENORM& denorm,
202  const WERD_CHOICE& BestChoice,
203  const WERD_CHOICE& BestRawChoice,
204  FLOAT32 Thresholds[]);
205 
207  int NumBadFeat,
208  FEATURE_ID BadFeat[],
209  INT_CLASS IClass,
210  ADAPT_CLASS Class,
213  CLASS_ID ClassId,
214  int FontinfoId,
215  int NumFeatures,
216  INT_FEATURE_ARRAY Features,
217  FEATURE_SET FloatFeatures);
218  void MakePermanent(ADAPT_TEMPLATES Templates,
219  CLASS_ID ClassId,
220  int ConfigId,
221  const DENORM& denorm,
222  TBLOB *Blob);
223  void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
224  void RemoveExtraPuncs(ADAPT_RESULTS *Results);
225  void RemoveBadMatches(ADAPT_RESULTS *Results);
226  void SetAdaptiveThreshold(FLOAT32 Threshold);
227  void ShowBestMatchFor(TBLOB *Blob,
228  const DENORM& denorm,
229  CLASS_ID ClassId,
230  int shape_id,
231  BOOL8 AdaptiveOn,
232  BOOL8 PreTrainedOn,
233  ADAPT_RESULTS *Results);
234  // Returns a string for the classifier class_id: either the corresponding
235  // unicharset debug_str or the shape_table_ debug str.
237  int class_id, int config_id) const;
238  // Converts a classifier class_id index with a config ID to:
239  // shape_table_ present: a shape_table_ index OR
240  // No shape_table_: a font ID.
241  // Without shape training, each class_id, config pair represents a single
242  // unichar id/font combination, so this function looks up the corresponding
243  // font id.
244  // With shape training, each class_id, config pair represents a single
245  // shape table index, so the fontset_table stores the shape table index,
246  // and the shape_table_ must be consulted to obtain the actual unichar_id/
247  // font combinations that the shape represents.
248  int ClassAndConfigIDToFontOrShapeID(int class_id,
249  int int_result_config) const;
250  // Converts a shape_table_ index to a classifier class_id index (not a
251  // unichar-id!). Uses a search, so not fast.
252  int ShapeIDToClassID(int shape_id) const;
254  const DENORM& denorm,
255  ADAPT_TEMPLATES Templates,
256  ADAPT_RESULTS *Results);
257  int CharNormClassifier(TBLOB *Blob,
258  const DENORM& denorm,
259  INT_TEMPLATES Templates,
260  ADAPT_RESULTS *Results);
261 
262  // As CharNormClassifier, but operates on a TrainingSample and outputs to
263  // a GenericVector of ShapeRating without conversion to classes.
264  int CharNormTrainingSample(bool pruner_only, const TrainingSample& sample,
265  GenericVector<ShapeRating>* results);
267  const DENORM& denorm,
268  CLASS_ID CorrectClass);
269  void DoAdaptiveMatch(TBLOB *Blob,
270  const DENORM& denorm,
271  ADAPT_RESULTS *Results);
272  void AdaptToChar(TBLOB *Blob,
273  const DENORM& denorm,
274  CLASS_ID ClassId,
275  int FontinfoId,
276  FLOAT32 Threshold);
277  void DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm,
278  INT_CLASS_STRUCT* int_class);
279  int AdaptableWord(TWERD *Word,
280  const WERD_CHOICE &BestChoiceWord,
281  const WERD_CHOICE &RawChoiceWord);
282  void EndAdaptiveClassifier();
283  void PrintAdaptiveStatistics(FILE *File);
284  void SettupPass1();
285  void SettupPass2();
286  void AdaptiveClassifier(TBLOB *Blob,
287  const DENORM& denorm,
288  BLOB_CHOICE_LIST *Choices,
289  CLASS_PRUNER_RESULTS cp_results);
290  void ClassifyAsNoise(ADAPT_RESULTS *Results);
292 
293  int GetBaselineFeatures(TBLOB *Blob,
294  const DENORM& denorm,
295  INT_TEMPLATES Templates,
296  INT_FEATURE_ARRAY IntFeatures,
297  uinT8* CharNormArray,
298  inT32 *BlobLength);
299  int GetCharNormFeatures(TBLOB *Blob,
300  const DENORM& denorm,
301  INT_TEMPLATES Templates,
302  INT_FEATURE_ARRAY IntFeatures,
303  uinT8* PrunerNormArray,
304  uinT8* CharNormArray,
305  inT32 *BlobLength,
306  inT32 *FeatureOutlineIndex);
307  // Computes the char_norm_array for the unicharset and, if not NULL, the
308  // pruner_array as appropriate according to the existence of the shape_table.
309  // The norm_feature is deleted as it is almost certainly no longer needed.
310  void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
311  INT_TEMPLATES_STRUCT* templates,
312  uinT8* char_norm_array,
313  uinT8* pruner_array);
314 
315  bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
316  void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, TBLOB *Blob);
317 
319  bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
320  bool LooksLikeGarbage(const DENORM& denorm, TBLOB *blob);
321  void RefreshDebugWindow(ScrollView **win, const char *msg,
322  int y_offset, const TBOX &wbox);
323  /* float2int.cpp ************************************************************/
324  void ClearCharNormArray(uinT8* char_norm_array);
325  void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
326  uinT8* char_norm_array);
327  void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
328  /* intproto.cpp *************************************************************/
329  INT_TEMPLATES ReadIntTemplates(FILE *File);
330  void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
331  const UNICHARSET& target_unicharset);
332  CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
333  bool* pretrained_on, int* shape_id);
334  void ShowMatchDisplay();
335  /* font detection ***********************************************************/
337  return fontinfo_table_;
338  }
340  return fontset_table_;
341  }
342  /* mfoutline.cpp ***********************************************************/
343  void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale);
344  /* outfeat.cpp ***********************************************************/
346  /* picofeat.cpp ***********************************************************/
348 
349 
350  // Member variables.
351 
352  // Parameters.
354  "Prioritize blob division over chopping");
355  INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
356  BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
357  INT_VAR_H(classify_debug_level, 0, "Classify debug level");
358 
359  /* mfoutline.cpp ***********************************************************/
360  /* control knobs used to control normalization of outlines */
361  INT_VAR_H(classify_norm_method, character, "Normalization Method ...");
363  "Character Normalization Range ...");
364  double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...");
365  double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
366  double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
367  double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");
368 
369  /* adaptmatch.cpp ***********************************************************/
370  BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
371  BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
372  BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
374  "Use pre-adapted classifier templates");
376  "Save adapted templates to a file");
377  BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
378  INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
379  INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
380  INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
381  double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
382  double_VAR_H(matcher_great_threshold, 0.0, "Great Match (0-1)");
383  double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
384  double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
385  double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
386  double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
387  INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
389  "Reliable Config Threshold");
391  "Enable adaption even if the ambiguities have not been seen");
393  "Maximum angle delta for prototype clustering");
395  "Penalty to apply when a non-alnum is vertically out of "
396  "its expected textline position");
397  double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
398  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
400  "Scale factor for features not used");
402  "Threshold for good protos during adaptive 0-255");
404  "Threshold for good features during adaptive 0-255");
406  "Do not include character fragments in the"
407  " results of the classifier");
409  "Exclude fragments that do not match any whole character"
410  " with at least this certainty");
412  "Bring up graphical debugging windows for fragments training");
414  "Use two different windows for debugging the matching: "
415  "One for the protos and one for the features.");
416  STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");
417 
418  /* intmatcher.cpp **********************************************************/
420  "Class Pruner Threshold 0-255");
422  "Class Pruner Multiplier 0-255: ");
424  "Class Pruner CutoffStrength: ");
426  "Integer Matcher Multiplier 0-255: ");
427 
428  // Use class variables to hold onto built-in templates and adapted templates.
431 
432  // Create dummy proto and config masks for use with the built-in templates.
440  /* normmatch.cpp */
442  /* font detection ***********************************************************/
444  // Without shape training, each class_id, config pair represents a single
445  // unichar id/font combination, so each fontset_table_ entry holds font ids
446  // for each config in the class.
447  // With shape training, each class_id, config pair represents a single
448  // shape_table_ index, so the fontset_table_ stores the shape_table_ index,
449  // and the shape_table_ must be consulted to obtain the actual unichar_id/
450  // font combinations that the shape represents.
452 
453  INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
455  "Assume the input is numbers [0-9].");
456 
457  protected:
460  // If a shape_table_ is present, it is used to remap classifier output in
461  // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
462  // mean an index to the shape_table_ and the choices returned are *all* the
463  // shape_table_ entries at that index.
465 
466  private:
467 
468  Dict dict_;
469 
470  /* variables used to hold performance statistics */
471  int AdaptiveMatcherCalls;
472  int BaselineClassifierCalls;
473  int CharNormClassifierCalls;
474  int AmbigClassifierCalls;
475  int NumWordsAdaptedTo;
476  int NumCharsAdaptedTo;
477  int NumBaselineClassesTried;
478  int NumCharNormClassesTried;
479  int NumAmbigClassesTried;
480  int NumClassesOutput;
481  int NumAdaptationsFailed;
482 
483  /* variables used to hold onto extracted features. This is used
484  to map from the old scheme in which baseline features and char norm
485  features are extracted separately, to the new scheme in which they
486  are extracted at the same time. */
487  bool FeaturesHaveBeenExtracted;
488  bool FeaturesOK;
489  INT_FEATURE_ARRAY BaselineFeatures;
490  INT_FEATURE_ARRAY CharNormFeatures;
491  INT_FX_RESULT_STRUCT FXInfo;
492 
493  // Expected number of features in the class pruner, used to penalize
494  // unknowns that have too few features (like a c being classified as e) so
495  // it doesn't recognize everything as '@' or '#'.
496  // CharNormCutoffs is for the static classifier (with no shapetable).
497  // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
498  // value in the adaptive classifier. Both are indexed by unichar_id.
499  // shapetable_cutoffs_ provides a similar value for each shape in the
500  // shape_table_.
501  uinT16* CharNormCutoffs;
502  uinT16* BaselineCutoffs;
503  GenericVector<uinT16> shapetable_cutoffs_;
504  ScrollView* learn_debug_win_;
505  ScrollView* learn_fragmented_word_debug_win_;
506  ScrollView* learn_fragments_debug_win_;
507 };
508 } // namespace tesseract
509 
510 #endif // TESSERACT_CLASSIFY_CLASSIFY_H__
int CharNormClassifier(TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int classify_class_pruner_threshold
Definition: classify.h:420
BIT_VECTOR AllProtosOn
Definition: classify.h:433
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
Definition: float2int.cpp:69
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:167
double matcher_rating_margin
Definition: classify.h:385
CP_RESULT_STRUCT CLASS_PRUNER_RESULTS[MAX_NUM_CLASSES]
Definition: intmatcher.h:57
IntegerMatcher im_
Definition: classify.h:458
int UNICHAR_ID
Definition: unichar.h:31
void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
Definition: mfoutline.cpp:346
void AdaptiveClassifier(TBLOB *Blob, const DENORM &denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS cp_results)
Definition: adaptmatch.cpp:178
int classify_adapt_feature_threshold
Definition: classify.h:404
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, const uinT8 *cn_factors)
void DebugAdaptiveClassifier(TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
UnicityTable< FontSet > fontset_table_
Definition: classify.h:451
void InitAdaptedClass(TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:758
void GetAdaptThresholds(TWERD *Word, const DENORM &denorm, const WERD_CHOICE &BestChoice, const WERD_CHOICE &BestRawChoice, FLOAT32 Thresholds[])
NORM_PROTOS * ReadNormProtos(FILE *File, inT64 end_offset)
Definition: normmatch.cpp:230
void MasterMatcher(INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int num_classes, const TBOX &blob_box, CLASS_PRUNER_RESULTS results, ADAPT_RESULTS *final_results)
BIT_VECTOR AllProtosOff
Definition: classify.h:436
void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
Definition: cutoffs.cpp:42
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:239
inT16 PROTO_ID
Definition: matchdefs.h:41
int classify_integer_matcher_multiplier
Definition: classify.h:426
CLASS_ID GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
Definition: intproto.cpp:1432
void LearnPieces(const char *filename, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:394
UnicityTable< FontSet > & get_fontset_table()
Definition: classify.h:339
long long int inT64
Definition: host.h:108
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:228
uinT16 CLASS_CUTOFF_ARRAY[MAX_NUM_CLASSES]
Definition: cutoffs.h:26
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:273
double classify_max_norm_scale_y
Definition: classify.h:367
void DisplayAdaptedChar(TBLOB *blob, const DENORM &denorm, INT_CLASS_STRUCT *int_class)
bool LooksLikeGarbage(const DENORM &denorm, TBLOB *blob)
void LearnWord(const char *filename, const char *rejmap, WERD_RES *word)
Definition: adaptmatch.cpp:254
Definition: cluster.h:32
char * classify_learn_debug_str
Definition: classify.h:416
bool classify_enable_adaptive_debugger
Definition: classify.h:377
void SetAdaptiveThreshold(FLOAT32 Threshold)
void ReadClassFile()
Definition: protos.cpp:293
unsigned char BOOL8
Definition: host.h:113
int classify_adapt_proto_threshold
Definition: classify.h:402
Definition: blobs.h:233
double matcher_clustering_max_angle_delta
Definition: classify.h:393
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
virtual ~Classify()
Definition: classify.cpp:178
int inT32
Definition: host.h:102
Definition: rect.h:29
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:94
bool classify_enable_adaptive_matcher
Definition: classify.h:372
void InitAdaptiveClassifier(bool load_pre_trained_templates)
Definition: adaptmatch.cpp:545
#define FALSE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
int ShapeIDToClassID(int shape_id) const
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:476
ShapeTable * shape_table_
Definition: classify.h:464
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const DENORM &denorm, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId)
Definition: adaptive.cpp:190
void AddNewResult(ADAPT_RESULTS *results, CLASS_ID class_id, int shape_id, FLOAT32 rating, bool adapted, int config, int fontinfo_id, int fontinfo_id2)
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:430
uinT8 FEATURE_ID
Definition: matchdefs.h:47
int matcher_permanent_classes_min
Definition: classify.h:387
int CharNormTrainingSample(bool pruner_only, const TrainingSample &sample, GenericVector< ShapeRating > *results)
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:57
double classify_char_norm_range
Definition: classify.h:363
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, CP_RESULT_STRUCT *results)
Definition: intmatcher.cpp:406
#define double_VAR_H(name, val, comment)
Definition: params.h:245
int AdaptableWord(TWERD *Word, const WERD_CHOICE &BestChoiceWord, const WERD_CHOICE &RawChoiceWord)
Definition: adaptmatch.cpp:894
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
#define STRING_VAR_H(name, val, comment)
Definition: params.h:242
double matcher_avg_noise_size
Definition: classify.h:386
double matcher_good_threshold
Definition: classify.h:381
Dict & getDict()
Definition: classify.h:62
void ClearCharNormArray(uinT8 *char_norm_array)
Definition: float2int.cpp:48
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:429
void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM &denorm, TBLOB *Blob)
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:410
bool disable_character_fragments
Definition: classify.h:407
BIT_VECTOR AllConfigsOff
Definition: classify.h:437
bool classify_bln_numeric_mode
Definition: classify.h:455
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:336
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1155
Definition: blobs.h:174
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:443
void ResetFeaturesHaveBeenExtracted()
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:528
int matcher_min_examples_for_prototyping
Definition: classify.h:389
bool classify_enable_learning
Definition: classify.h:356
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
INT_TEMPLATES ReadIntTemplates(FILE *File)
Definition: intproto.cpp:786
void AdaptToChar(TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
Definition: adaptmatch.cpp:933
double classify_misfit_junk_penalty
Definition: classify.h:396
unsigned short uinT16
Definition: host.h:101
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:391
CharSegmentationType
Definition: classify.h:51
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
void RemoveBadMatches(ADAPT_RESULTS *Results)
Definition: strngs.h:40
int classify_learning_debug_level
Definition: classify.h:380
short inT16
Definition: host.h:100
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:36
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: baseapi.h:66
void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results)
void ClassifyAsNoise(ADAPT_RESULTS *Results)
bool matcher_debug_separate_windows
Definition: classify.h:415
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, const uinT8 *cn_factors, INT_RESULT_STRUCT &int_result, ADAPT_RESULTS *final_results)
double certainty_scale
Definition: classify.h:398
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:854
bool AdaptiveClassifierIsFull()
Definition: classify.h:319
#define INT_VAR_H(name, val, comment)
Definition: params.h:236
bool classify_use_pre_adapted_templates
Definition: classify.h:374
unsigned char uinT8
Definition: host.h:99
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
Definition: normmatch.cpp:73
bool classify_save_adapted_templates
Definition: classify.h:376
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:507
int classify_cp_cutoff_strength
Definition: classify.h:424
void AdaptToPunc(TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
double classify_min_norm_scale_y
Definition: classify.h:366
int GetBaselineFeatures(TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *CharNormArray, inT32 *BlobLength)
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:573
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, const DENORM &denorm, TBLOB *Blob)
double matcher_perfect_threshold
Definition: classify.h:383
void ShowBestMatchFor(TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int shape_id, BOOL8 AdaptiveOn, BOOL8 PreTrainedOn, ADAPT_RESULTS *Results)
void AmbigClassifier(TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_CLASS *Classes, UNICHAR_ID *Ambiguities, ADAPT_RESULTS *Results)
BIT_VECTOR TempProtoMask
Definition: classify.h:438
const ShapeTable * shape_table() const
Definition: classify.h:66
double classify_min_norm_scale_x
Definition: classify.h:364
double tessedit_class_miss_scale
Definition: classify.h:400
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
void PrintAdaptiveStatistics(FILE *File)
Definition: adaptmatch.cpp:659
int GetCharNormFeatures(TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *PrunerNormArray, uinT8 *CharNormArray, inT32 *BlobLength, inT32 *FeatureOutlineIndex)
int classify_class_pruner_multiplier
Definition: classify.h:422
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File)
Definition: adaptive.cpp:371
bool prioritize_division
Definition: classify.h:354
double matcher_bad_match_pad
Definition: classify.h:384
BIT_VECTOR PrunedProtos
Definition: classify.h:434
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
NORM_PROTOS * NormProtos
Definition: classify.h:441
double matcher_great_threshold
Definition: classify.h:382
BIT_VECTOR AllConfigsOn
Definition: classify.h:435
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:636
void DoAdaptiveMatch(TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, const DENORM &denorm, CLASS_ID CorrectClass)
double classify_max_norm_scale_x
Definition: classify.h:365
bool classify_debug_character_fragments
Definition: classify.h:412
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:35
#define TRUE
Definition: capi.h:27
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:459