Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::ClassPruner Class Reference

Public Member Functions

 ClassPruner (int max_classes)
 
 ~ClassPruner ()
 
void ComputeScores (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features)
 
void AdjustForExpectedNumFeatures (const uinT16 *expected_num_features, int cutoff_strength)
 
void DisableDisabledClasses (const UNICHARSET &unicharset)
 
void DisableFragments (const UNICHARSET &unicharset)
 
void NormalizeForXheight (int norm_multiplier, const uinT8 *normalization_factors)
 
void NoNormalization ()
 
void PruneAndSort (int pruning_factor, bool max_of_non_fragments, const UNICHARSET &unicharset)
 
void DebugMatch (const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, const INT_FEATURE_STRUCT *features) const
 
void SummarizeResult (const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, const uinT16 *expected_num_features, int norm_multiplier, const uinT8 *normalization_factors) const
 
int SetupResults (CP_RESULT_STRUCT *results) const
 

Detailed Description

Definition at line 109 of file intmatcher.cpp.

Constructor & Destructor Documentation

tesseract::ClassPruner::ClassPruner ( int  max_classes)
inline

Definition at line 111 of file intmatcher.cpp.

111  {
112  // The unrolled loop in ComputeScores means that the array sizes need to
113  // be rounded up so that the array is big enough to accommodate the extra
114  // entries accessed by the unrolling. Each pruner word is of size
115  // BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are
116  // BITS_PER_WERD / NUM_BITS_PER_CLASS entries.
117  // See ComputeScores.
118  max_classes_ = max_classes;
119  rounded_classes_ = RoundUp(
120  max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS);
121  class_count_ = new int[rounded_classes_];
122  norm_count_ = new int[rounded_classes_];
123  sort_key_ = new int[rounded_classes_ + 1];
124  sort_index_ = new int[rounded_classes_ + 1];
125  for (int i = 0; i < rounded_classes_; i++) {
126  class_count_[i] = 0;
127  }
128  pruning_threshold_ = 0;
129  num_features_ = 0;
130  num_classes_ = 0;
131  }
int RoundUp(int n, int block_size)
Definition: helpers.h:58
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:59
#define NUM_BITS_PER_CLASS
Definition: intproto.h:52
#define BITS_PER_WERD
Definition: intproto.h:42
tesseract::ClassPruner::~ClassPruner ( )
inline

Definition at line 133 of file intmatcher.cpp.

133  {
134  delete []class_count_;
135  delete []norm_count_;
136  delete []sort_key_;
137  delete []sort_index_;
138  }

Member Function Documentation

void tesseract::ClassPruner::AdjustForExpectedNumFeatures ( const uinT16 *expected_num_features,
int  cutoff_strength 
)
inline

Definition at line 213 of file intmatcher.cpp.

214  {
215  for (int class_id = 0; class_id < max_classes_; ++class_id) {
216  if (num_features_ < expected_num_features[class_id]) {
217  int deficit = expected_num_features[class_id] - num_features_;
218  class_count_[class_id] -= class_count_[class_id] * deficit /
219  (num_features_ * cutoff_strength + deficit);
220  }
221  }
222  }
void tesseract::ClassPruner::ComputeScores ( const INT_TEMPLATES_STRUCT *int_templates,
int  num_features,
const INT_FEATURE_STRUCT *features 
)
inline

Definition at line 142 of file intmatcher.cpp.

143  {
144  num_features_ = num_features;
145  int num_pruners = int_templates->NumClassPruners;
146  for (int f = 0; f < num_features; ++f) {
147  const INT_FEATURE_STRUCT* feature = &features[f];
148  // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
149  int x = feature->X * NUM_CP_BUCKETS >> 8;
150  int y = feature->Y * NUM_CP_BUCKETS >> 8;
151  int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
152  int class_id = 0;
153  // Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
154  // we need a collection of them, indexed by pruner_set.
155  for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
156  // Look up quantized feature in a 3-D array, an array of weights for
157  // each class.
158  const uinT32* pruner_word_ptr =
159  int_templates->ClassPruners[pruner_set]->p[x][y][theta];
160  for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
161  uinT32 pruner_word = *pruner_word_ptr++;
162  // This inner loop is unrolled to speed up the ClassPruner.
163  // Currently gcc would not unroll it unless it is set to O3
164  // level of optimization or -funroll-loops is specified.
165  /*
166  uinT32 class_mask = (1 << NUM_BITS_PER_CLASS) - 1;
167  for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
168  class_count_[class_id++] += pruner_word & class_mask;
169  pruner_word >>= NUM_BITS_PER_CLASS;
170  }
171  */
172  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
173  pruner_word >>= NUM_BITS_PER_CLASS;
174  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
175  pruner_word >>= NUM_BITS_PER_CLASS;
176  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
177  pruner_word >>= NUM_BITS_PER_CLASS;
178  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
179  pruner_word >>= NUM_BITS_PER_CLASS;
180  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
181  pruner_word >>= NUM_BITS_PER_CLASS;
182  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
183  pruner_word >>= NUM_BITS_PER_CLASS;
184  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
185  pruner_word >>= NUM_BITS_PER_CLASS;
186  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
187  pruner_word >>= NUM_BITS_PER_CLASS;
188  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
189  pruner_word >>= NUM_BITS_PER_CLASS;
190  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
191  pruner_word >>= NUM_BITS_PER_CLASS;
192  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
193  pruner_word >>= NUM_BITS_PER_CLASS;
194  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
195  pruner_word >>= NUM_BITS_PER_CLASS;
196  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
197  pruner_word >>= NUM_BITS_PER_CLASS;
198  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
199  pruner_word >>= NUM_BITS_PER_CLASS;
200  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
201  pruner_word >>= NUM_BITS_PER_CLASS;
202  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
203  }
204  }
205  }
206  }
#define NUM_CP_BUCKETS
Definition: intproto.h:50
#define f(xc, yc)
Definition: imgscale.cpp:39
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:59
uinT32 p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR]
Definition: intproto.h:75
#define NUM_BITS_PER_CLASS
Definition: intproto.h:52
#define CLASS_PRUNER_CLASS_MASK
Definition: intproto.h:53
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:123
unsigned int uinT32
Definition: host.h:103
void tesseract::ClassPruner::DebugMatch ( const Classify &classify,
const INT_TEMPLATES_STRUCT *int_templates,
const INT_FEATURE_STRUCT *features 
) const
inline

Definition at line 300 of file intmatcher.cpp.

302  {
303  int num_pruners = int_templates->NumClassPruners;
304  int max_num_classes = int_templates->NumClasses;
305  for (int f = 0; f < num_features_; ++f) {
306  const INT_FEATURE_STRUCT* feature = &features[f];
307  tprintf("F=%3d(%d,%d,%d),", f, feature->X, feature->Y, feature->Theta);
308  // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
309  int x = feature->X * NUM_CP_BUCKETS >> 8;
310  int y = feature->Y * NUM_CP_BUCKETS >> 8;
311  int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
312  int class_id = 0;
313  for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
314  // Look up quantized feature in a 3-D array, an array of weights for
315  // each class.
316  const uinT32* pruner_word_ptr =
317  int_templates->ClassPruners[pruner_set]->p[x][y][theta];
318  for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
319  uinT32 pruner_word = *pruner_word_ptr++;
320  for (int word_class = 0; word_class < 16 &&
321  class_id < max_num_classes; ++word_class, ++class_id) {
322  if (norm_count_[class_id] >= pruning_threshold_) {
323  tprintf(" %s=%d,",
324  classify.ClassIDToDebugStr(int_templates,
325  class_id, 0).string(),
326  pruner_word & CLASS_PRUNER_CLASS_MASK);
327  }
328  pruner_word >>= NUM_BITS_PER_CLASS;
329  }
330  }
331  tprintf("\n");
332  }
333  }
334  }
#define NUM_CP_BUCKETS
Definition: intproto.h:50
#define f(xc, yc)
Definition: imgscale.cpp:39
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:59
uinT32 p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR]
Definition: intproto.h:75
#define NUM_BITS_PER_CLASS
Definition: intproto.h:52
#define CLASS_PRUNER_CLASS_MASK
Definition: intproto.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:123
unsigned int uinT32
Definition: host.h:103
void tesseract::ClassPruner::DisableDisabledClasses ( const UNICHARSET &unicharset)
inline

Definition at line 226 of file intmatcher.cpp.

226  {
227  for (int class_id = 0; class_id < max_classes_; ++class_id) {
228  if (!unicharset.get_enabled(class_id))
229  class_count_[class_id] = 0; // This char is disabled!
230  }
231  }
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:747
void tesseract::ClassPruner::DisableFragments ( const UNICHARSET &unicharset)
inline

Definition at line 234 of file intmatcher.cpp.

234  {
235  for (int class_id = 0; class_id < max_classes_; ++class_id) {
236  // Do not include character fragments in the class pruner
237  // results if disable_character_fragments is true.
238  if (unicharset.get_fragment(class_id)) {
239  class_count_[class_id] = 0;
240  }
241  }
242  }
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
void tesseract::ClassPruner::NoNormalization ( )
inline

Definition at line 257 of file intmatcher.cpp.

257  {
258  for (int class_id = 0; class_id < max_classes_; class_id++) {
259  norm_count_[class_id] = class_count_[class_id];
260  }
261  }
void tesseract::ClassPruner::NormalizeForXheight ( int  norm_multiplier,
const uinT8 *normalization_factors 
)
inline

Definition at line 248 of file intmatcher.cpp.

249  {
250  for (int class_id = 0; class_id < max_classes_; class_id++) {
251  norm_count_[class_id] = class_count_[class_id] -
252  ((norm_multiplier * normalization_factors[class_id]) >> 8);
253  }
254  }
void tesseract::ClassPruner::PruneAndSort ( int  pruning_factor,
bool  max_of_non_fragments,
const UNICHARSET &unicharset 
)
inline

Definition at line 266 of file intmatcher.cpp.

267  {
268  int max_count = 0;
269  for (int c = 0; c < max_classes_; ++c) {
270  if (norm_count_[c] > max_count &&
271  // This additional check is added in order to ensure that
272  // the classifier will return at least one non-fragmented
273  // character match.
274  // TODO(daria): verify that this helps accuracy and does not
275  // hurt performance.
276  (!max_of_non_fragments || !unicharset.get_fragment(c))) {
277  max_count = norm_count_[c];
278  }
279  }
280  // Prune Classes.
281  pruning_threshold_ = (max_count * pruning_factor) >> 8;
282  // Select Classes.
283  if (pruning_threshold_ < 1)
284  pruning_threshold_ = 1;
285  num_classes_ = 0;
286  for (int class_id = 0; class_id < max_classes_; class_id++) {
287  if (norm_count_[class_id] >= pruning_threshold_) {
288  ++num_classes_;
289  sort_index_[num_classes_] = class_id;
290  sort_key_[num_classes_] = norm_count_[class_id];
291  }
292  }
293 
294  // Sort Classes using Heapsort Algorithm.
295  if (num_classes_ > 1)
296  HeapSort(num_classes_, sort_key_, sort_index_);
297  }
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
void HeapSort(int n, register int ra[], register int rb[])
int tesseract::ClassPruner::SetupResults ( CP_RESULT_STRUCT *results) const
inline

Definition at line 360 of file intmatcher.cpp.

360  {
361  for (int c = 0; c < num_classes_; ++c) {
362  results[c].Class = sort_index_[num_classes_ - c];
363  results[c].Rating = 1.0 - sort_key_[num_classes_ - c] /
364  (static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_);
365  }
366  return num_classes_;
367  }
FLOAT32 Rating
Definition: intmatcher.h:52
#define CLASS_PRUNER_CLASS_MASK
Definition: intproto.h:53
CLASS_ID Class
Definition: intmatcher.h:54
void tesseract::ClassPruner::SummarizeResult ( const Classify &classify,
const INT_TEMPLATES_STRUCT *int_templates,
const uinT16 *expected_num_features,
int  norm_multiplier,
const uinT8 *normalization_factors 
) const
inline

Definition at line 337 of file intmatcher.cpp.

341  {
342  tprintf("CP:%d classes, %d features:\n", num_classes_, num_features_);
343  for (int i = 0; i < num_classes_; ++i) {
344  int class_id = sort_index_[num_classes_ - i];
345  STRING class_string = classify.ClassIDToDebugStr(int_templates,
346  class_id, 0);
347  tprintf("%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\n",
348  class_string.string(),
349  class_count_[class_id],
350  expected_num_features[class_id],
351  (norm_multiplier * normalization_factors[class_id]) >> 8,
352  sort_key_[num_classes_ - i],
353  100.0 - 100.0 * sort_key_[num_classes_ - i] /
354  (CLASS_PRUNER_CLASS_MASK * num_features_));
355  }
356  }
const char * string() const
Definition: strngs.cpp:156
#define CLASS_PRUNER_CLASS_MASK
Definition: intproto.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: strngs.h:40

The documentation for this class was generated from the following file: