Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
pageres.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.cpp (Formerly page_res.c)
3  * Description: Results classes used by control.c
4  * Author: Phil Cheatle
5  * Created: Tue Sep 22 08:42:49 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #include "mfcpch.h"
20 #include <stdlib.h>
21 #ifdef __UNIX__
22 #include <assert.h>
23 #endif
24 #include "pageres.h"
25 #include "blobs.h"
26 
27 const char kBlameCorrect[] = "corr";
28 const char kBlameClassifier[] = "cl";
29 const char kBlameChopper[] = "chop";
30 const char kBlameClassLMTradeoff[] = "cl/LM";
31 const char kBlamePageLayout[] = "pglt";
32 const char kBlameSegsearchHeur[] = "ss_heur";
33 const char kBlameSegsearchPP[] = "ss_pp";
34 const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
35 const char kBlameAdaption[] = "adapt";
36 const char kBlameNoTruthSplit[] = "no_tr_spl";
37 const char kBlameNoTruth[] = "no_tr";
38 const char kBlameUnknown[] = "unkn";
39 
40 const char * const kIncorrectResultReasonNames[] = {
53 };
54 
56  return kIncorrectResultReasonNames[irr];
57 }
58 
59 const char *BlamerBundle::IncorrectReason() const {
61 }
62 
64  const WERD_CHOICE *choice,
65  STRING *debug) {
66  (*debug) += "Truth ";
67  for (int i = 0; i < this->truth_text.length(); ++i) {
68  (*debug) += this->truth_text[i];
69  }
70  if (!this->truth_has_char_boxes) (*debug) += " (no char boxes)";
71  if (choice != NULL) {
72  (*debug) += " Choice ";
73  STRING choice_str;
74  choice->string_and_lengths(&choice_str, NULL);
75  (*debug) += choice_str;
76  }
77  if (msg.length() > 0) {
78  (*debug) += "\n";
79  (*debug) += msg;
80  }
81  (*debug) += "\n";
82 }
83 
86 /*************************************************************************
87  * PAGE_RES::PAGE_RES
88  *
89  * Constructor for page results
90  *************************************************************************/
92  BLOCK_LIST *the_block_list,
93  WERD_CHOICE **prev_word_best_choice_ptr) {
94  Init();
95  BLOCK_IT block_it(the_block_list);
96  BLOCK_RES_IT block_res_it(&block_res_list);
97  for (block_it.mark_cycle_pt();
98  !block_it.cycled_list(); block_it.forward()) {
99  block_res_it.add_to_end(new BLOCK_RES(block_it.data()));
100  }
101  prev_word_best_choice = prev_word_best_choice_ptr;
102 }
103 
104 /*************************************************************************
105  * BLOCK_RES::BLOCK_RES
106  *
107  * Constructor for BLOCK results
108  *************************************************************************/
109 
111  ROW_IT row_it (the_block->row_list ());
112  ROW_RES_IT row_res_it(&row_res_list);
113 
114  char_count = 0;
115  rej_count = 0;
116  font_class = -1; //not assigned
117  x_height = -1.0;
119  bold = FALSE;
120  italic = FALSE;
121  row_count = 0;
122 
123  block = the_block;
124 
125  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
126  row_res_it.add_to_end(new ROW_RES(row_it.data()));
127  }
128 }
129 
130 
131 /*************************************************************************
132  * ROW_RES::ROW_RES
133  *
134  * Constructor for ROW results
135  *************************************************************************/
136 
138  WERD_IT word_it(the_row->word_list());
139  WERD_RES_IT word_res_it(&word_res_list);
140  WERD_RES *combo = NULL; // current combination of fuzzies
141  WERD_RES *word_res; // current word
142  WERD *copy_word;
143 
144  char_count = 0;
145  rej_count = 0;
147 
148  row = the_row;
149  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
150  word_res = new WERD_RES(word_it.data());
151  word_res->x_height = the_row->x_height();
152 
153  if (word_res->word->flag(W_FUZZY_NON)) {
154  ASSERT_HOST(combo != NULL);
155  word_res->part_of_combo = TRUE;
156  combo->copy_on(word_res);
157  }
158  if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
159  if (combo == NULL) {
160  copy_word = new WERD;
161  //deep copy
162  *copy_word = *(word_it.data());
163  combo = new WERD_RES(copy_word);
164  combo->x_height = the_row->x_height();
165  combo->combination = TRUE;
166  word_res_it.add_to_end(combo);
167  }
168  word_res->part_of_combo = TRUE;
169  } else {
170  combo = NULL;
171  }
172  word_res_it.add_to_end(word_res);
173  }
174 }
175 
176 
178  this->ELIST_LINK::operator=(source);
179  Clear();
180  if (source.combination) {
181  word = new WERD;
182  *word = *(source.word); // deep copy
183  } else {
184  word = source.word; // pt to same word
185  }
186  if (source.bln_boxes != NULL)
187  bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
188  if (source.chopped_word != NULL)
189  chopped_word = new TWERD(*source.chopped_word);
190  if (source.rebuild_word != NULL)
191  rebuild_word = new TWERD(*source.rebuild_word);
192  // TODO(rays) Do we ever need to copy the seam_array?
193  denorm = source.denorm;
194  if (source.box_word != NULL)
195  box_word = new tesseract::BoxWord(*source.box_word);
196  best_state = source.best_state;
197  correct_text = source.correct_text;
198 
199  if (source.best_choice != NULL) {
200  best_choice = new WERD_CHOICE(*source.best_choice);
201  raw_choice = new WERD_CHOICE(*source.raw_choice);
203  }
204  else {
205  best_choice = NULL;
206  raw_choice = NULL;
209  }
210  }
211  for (int i = 0; i < source.alt_choices.length(); ++i) {
212  const WERD_CHOICE *choice = source.alt_choices[i];
213  ASSERT_HOST(choice != NULL);
214  alt_choices.push_back(new WERD_CHOICE(*choice));
215  }
216  alt_states = source.alt_states;
217  if (source.ep_choice != NULL) {
218  ep_choice = new WERD_CHOICE(*source.ep_choice);
219  } else {
220  ep_choice = NULL;
221  }
222  reject_map = source.reject_map;
223  combination = source.combination;
224  part_of_combo = source.part_of_combo;
225  CopySimpleFields(source);
226  if (source.blamer_bundle != NULL) {
227  blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
228  }
229  return *this;
230 }
231 
232 // Copies basic fields that don't involve pointers that might be useful
233 // to copy when making one WERD_RES from another.
235  tess_failed = source.tess_failed;
236  tess_accepted = source.tess_accepted;
238  done = source.done;
240  small_caps = source.small_caps;
241  italic = source.italic;
242  bold = source.bold;
243  fontinfo = source.fontinfo;
244  fontinfo2 = source.fontinfo2;
247  x_height = source.x_height;
248  caps_height = source.caps_height;
249  guessed_x_ht = source.guessed_x_ht;
251  reject_spaces = source.reject_spaces;
252  uch_set = source.uch_set;
253  tesseract = source.tesseract;
254 }
255 
256 // Initializes a blank (default constructed) WERD_RES from one that has
257 // already been recognized.
258 // Use SetupFor*Recognition afterwards to complete the setup and make
259 // it ready for a retry recognition.
261  word = source.word;
262  CopySimpleFields(source);
263  if (source.blamer_bundle != NULL) {
264  blamer_bundle = new BlamerBundle();
266  }
267 }
268 
269 // Sets up the members used in recognition:
270 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
271 // Returns false if the word is empty and sets up fake results.
273  tesseract::Tesseract* tess, Pix* pix,
274  bool numeric_mode,
275  bool use_body_size,
276  ROW *row, BLOCK* block) {
277  tesseract = tess;
278  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
279  if (word->cblob_list()->empty() || (pb != NULL && !pb->IsText())) {
280  // Empty words occur when all the blobs have been moved to the rej_blobs
281  // list, which seems to occur frequently in junk.
282  SetupFake(unicharset_in);
283  word->set_flag(W_REP_CHAR, false);
284  return false;
285  }
286  ClearResults();
287  SetupWordScript(unicharset_in);
289  if (use_body_size && row->body_size() > 0.0f) {
290  chopped_word->SetupBLNormalize(block, row, row->body_size(),
291  numeric_mode, &denorm);
292  } else {
293  chopped_word->SetupBLNormalize(block, row, x_height, numeric_mode, &denorm);
294  }
295  // The image will be 8-bit grey if the input was grey or color. Note that in
296  // a grey image 0 is black and 255 is white. If the input was binary, then
297  // the pix will be binary and 0 is white, with 1 being black.
298  // To tell the difference pixGetDepth() will return 8 or 1.
299  denorm.set_pix(pix);
300  // The inverse flag will be true iff the word has been determined to be white
301  // on black, and is independent of whether the pix is 8 bit or 1 bit.
306  best_choice = new WERD_CHOICE(&unicharset_in);
308  raw_choice = new WERD_CHOICE(&unicharset_in);
309  raw_choice->make_bad();
311  return true;
312 }
313 
314 // Sets up the members used in recognition:
315 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
316 // Returns false if the word is empty and sets up fake results.
318  tesseract::Tesseract* tess,
319  const BLOCK* block) {
320  tesseract = tess;
321  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
322  if (pb != NULL && !pb->IsText()) {
323  // Ignore words in graphic regions.
324  SetupFake(unicharset_in);
325  word->set_flag(W_REP_CHAR, false);
326  return false;
327  }
328  ClearResults();
329  SetupWordScript(unicharset_in);
330  TBOX word_box = word->bounding_box();
332  word_box.left(), word_box.bottom(),
333  1.0f, 1.0f, 0.0f, 0.0f);
335  return true;
336 }
337 
338 // Sets up the members used in recognition for an empty recognition result:
339 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
340 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
341  ClearResults();
342  SetupWordScript(unicharset_in);
343  chopped_word = new TWERD;
344  rebuild_word = new TWERD;
347  int blob_count = word->cblob_list()->length();
348  best_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
349  TOP_CHOICE_PERM, unicharset_in);
350  raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
351  TOP_CHOICE_PERM, unicharset_in);
352  if (blob_count > 0) {
353  BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
354  // For non-text blocks, just pass any blobs through to the box_word
355  // and call the word failed with a fake classification.
356  C_BLOB_IT b_it(word->cblob_list());
357  int blob_id = 0;
358  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
359  TBOX box = b_it.data()->bounding_box();
360  box_word->InsertBox(box_word->length(), box);
361  fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f,
362  -1, -1, -1, 0, 0, false);
363  }
364  FakeClassifyWord(blob_count, fake_choices);
365  delete [] fake_choices;
366  }
367  tess_failed = true;
368 }
369 
371  uch_set = &uch;
372  int script = uch.default_sid();
373  word->set_script_id(script);
375  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
376 }
377 
378 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
380  if (blamer_bundle != NULL) {
381  blamer_bundle->norm_box_tolerance = kBlamerBoxTolerance * denorm.x_scale();
382  TPOINT topleft;
383  TPOINT botright;
384  TPOINT norm_topleft;
385  TPOINT norm_botright;
386  for (int b = 0; b < blamer_bundle->truth_word.length(); ++b) {
387  const TBOX &box = blamer_bundle->truth_word.BlobBox(b);
388  topleft.x = box.left();
389  topleft.y = box.top();
390  botright.x = box.right();
391  botright.y = box.bottom();
392  denorm.NormTransform(topleft, &norm_topleft);
393  denorm.NormTransform(botright, &norm_botright);
394  TBOX norm_box(norm_topleft.x, norm_botright.y,
395  norm_botright.x, norm_topleft.y);
397  }
398  }
399 }
400 
// Transfers ownership of the object held in *src to *dest.
// Whatever *dest previously owned is deleted, *dest receives the pointer
// from *src, and *src is set to NULL so that only one owner remains.
//   dest: address of the owning pointer that receives the data.
//   src:  address of the owning pointer that gives the data up.
// Aliasing is tolerated: if dest and src are the same slot nothing happens,
// and if both slots already hold the same object it is not deleted before
// being handed over (which would leave *dest dangling).
template<class T> static void MovePointerData(T** dest, T** src) {
  if (dest == src)
    return;  // Moving a slot onto itself must not destroy its contents.
  if (*dest != *src)
    delete *dest;  // Only release the old object if it is not the new one.
  *dest = *src;
  *src = NULL;
}
408 
409 // Moves the results fields from word to this. This takes ownership of all
410 // the data, so src can be destructed.
412  denorm = word->denorm;
413  MovePointerData(&chopped_word, &word->chopped_word);
414  MovePointerData(&rebuild_word, &word->rebuild_word);
415  MovePointerData(&box_word, &word->box_word);
416  if (seam_array != NULL)
418  seam_array = word->seam_array;
419  word->seam_array = NULL;
420  best_state.move(&word->best_state);
422  MovePointerData(&best_choice, &word->best_choice);
423  MovePointerData(&raw_choice, &word->raw_choice);
425  alt_choices.move(&word->alt_choices);
426  alt_states.move(&word->alt_states);
427  reject_map = word->reject_map;
428  if (word->blamer_bundle != NULL) {
429  assert(blamer_bundle != NULL);
431  }
432  CopySimpleFields(*word);
433 }
434 
435 // Replace the best choice and rebuild box word.
437  const WERD_CHOICE& choice,
438  const GenericVector<int>& segmentation_state) {
439  delete best_choice;
440  best_choice = new WERD_CHOICE(choice);
441  best_state = segmentation_state;
443  SetupBoxWord();
444  // Make up a fake reject map of the right length to keep the
445  // rejection pass happy.
446  reject_map.initialise(segmentation_state.length());
449 }
450 
451 // Builds the rebuild_word from the chopped_word and the best_state.
453  if (rebuild_word != NULL)
454  delete rebuild_word;
455  rebuild_word = new TWERD;
456  if (seam_array == NULL) {
458  }
459  TBLOB* prev_blob = NULL;
460  int start = 0;
461  for (int i = 0; i < best_state.size(); ++i) {
462  int length = best_state[i];
463  join_pieces(chopped_word->blobs, seam_array, start, start + length - 1);
464  TBLOB* blob = chopped_word->blobs;
465  for (int i = 0; i < start; ++i)
466  blob = blob->next;
467  TBLOB* copy_blob = new TBLOB(*blob);
468  if (prev_blob == NULL)
469  rebuild_word->blobs = copy_blob;
470  else
471  prev_blob->next = copy_blob;
472  prev_blob = copy_blob;
473  break_pieces(blob, seam_array, start, start + length - 1);
474  start += length;
475  }
476 }
477 
478 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
479 // Also sets up the output box_word.
481  if (rebuild_word != NULL)
482  delete rebuild_word;
484  SetupBoxWord();
485  int word_len = box_word->length();
486  best_state.reserve(word_len);
487  correct_text.reserve(word_len);
488  for (int i = 0; i < word_len; ++i) {
491  }
492 }
493 
494 // Sets/replaces the box_word with one made from the rebuild_word.
496  if (box_word != NULL)
497  delete box_word;
501 }
502 
503 // Sets up the script positions in the output boxword using the best_choice
504 // to get the unichars, and the unicharset to get the target positions.
507  best_choice);
508 }
509 
510 void WERD_RES::WithoutFootnoteSpan(int *pstart, int *pend) const {
511  int end = best_choice->length();
512  while (end > 0 &&
513  uch_set->get_isdigit(best_choice->unichar_ids()[end - 1]) &&
515  end--;
516  }
517  int start = 0;
518  while (start < end &&
521  start++;
522  }
523  *pstart = start;
524  *pend = end;
525 }
526 
528  const WERD_CHOICE &word, const GenericVector<int> &state,
529  int *pstart, int *pend) const {
530  int len = word.length();
531  *pstart = 0;
532  *pend = len;
533  if (len < 2) return;
534  if (!word.unicharset()->get_isdigit(word.unichar_ids()[len - 1]) &&
535  !word.unicharset()->get_isdigit(word.unichar_ids()[0])) return;
536 
537  // ok, now that we know the word ends in digits, do the expensive bit of
538  // figuring out if they're superscript.
539  WERD_RES copy(*this);
540  copy.ReplaceBestChoice(word, state);
541  copy.WithoutFootnoteSpan(pstart, pend);
542 }
543 
544 // Classifies the word with some already-calculated BLOB_CHOICEs.
545 // The choices are an array of blob_count pointers to BLOB_CHOICE,
546 // providing a single classifier result for each blob.
547 // The BLOB_CHOICEs are consumed and the word takes ownership.
548 // The number of blobs in the outword must match blob_count.
549 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
550  // Setup the WERD_RES.
552  ASSERT_HOST(blob_count == box_word->length());
554  BLOB_CHOICE_LIST_CLIST* word_choices = new BLOB_CHOICE_LIST_CLIST;
555  BLOB_CHOICE_LIST_C_IT bc_it(word_choices);
556  for (int c = 0; c < blob_count; ++c) {
558  choices[c]->unichar_id(), 1,
559  choices[c]->rating(), choices[c]->certainty());
560  BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
561  BLOB_CHOICE_IT choice_it(choice_list);
562  choice_it.add_after_then_move(choices[c]);
563  bc_it.add_after_then_move(choice_list);
564  }
565  best_choice->set_blob_choices(word_choices);
566  delete raw_choice;
568  reject_map.initialise(blob_count);
569 }
570 
571 // Copies the best_choice strings to the correct_text for adaption/training.
575  for (int i = 0; i < best_choice->length(); ++i) {
576  UNICHAR_ID choice_id = best_choice->unichar_id(i);
577  const char* blob_choice = uch_set->id_to_unichar(choice_id);
578  correct_text.push_back(STRING(blob_choice));
579  }
580 }
581 
582 // Merges 2 adjacent blobs in the result if the permanent callback
583 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
584 // callback box_cb is NULL or returns true, setting the merged blob
585 // result to the class returned from class_cb.
586 // Returns true if anything was merged.
590 
591  BLOB_CHOICE_LIST_CLIST *blob_choices) {
592  bool modified = false;
593  for (int i = 0; i + 1 < best_choice->length(); ++i) {
594  UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
595  best_choice->unichar_id(i+1));
596  if (new_id != INVALID_UNICHAR_ID &&
597  (box_cb == NULL || box_cb->Run(box_word->BlobBox(i),
598  box_word->BlobBox(i + 1)))) {
599  if (reject_map.length() == best_choice->length())
601  best_choice->set_unichar_id(new_id, i);
603  raw_choice->set_unichar_id(new_id, i);
605  modified = true;
606  rebuild_word->MergeBlobs(i, i + 2);
607  box_word->MergeBoxes(i, i + 2);
608  if (i + 1 < best_state.length()) {
609  best_state[i] += best_state[i + 1];
610  best_state.remove(i + 1);
611  }
612 
613  BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
614  for (int j = 0; j < i; ++j)
615  blob_choices_it.forward();
616  BLOB_CHOICE_IT it1(blob_choices_it.data()); // first choices
617  BLOB_CHOICE_LIST* target_choices = blob_choices_it.data_relative(1);
618  BLOB_CHOICE_IT it2(target_choices); // second choices
619  float certainty = it2.data()->certainty();
620  float rating = it2.data()->rating();
621  if (it1.data()->certainty() < certainty) {
622  certainty = it1.data()->certainty();
623  rating = it1.data()->rating();
624  target_choices = blob_choices_it.data();
625  blob_choices_it.forward();
626  }
627  delete blob_choices_it.extract(); // get rid of spare
628  // TODO(rays) Fix the choices so they contain the desired result.
629  // Do we really need to ? Only needed for fix_quotes, which should be
630  // going away.
631  }
632  }
633  delete class_cb;
634  delete box_cb;
635  return modified;
636 }
637 
638 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
639 // training data.
640 
// Utility function for fix_quotes.
// Tests whether the character starting at signed_str, whose UTF-8 encoding
// is length bytes long, is a single quote mark. Returns non-zero for the
// one-byte characters ' and `, and for the three-byte sequences E2 80 98 and
// E2 80 99 (the curved single quotes); returns zero for anything else.
// No byte of the string is read unless length is 1 or 3.
static int is_simple_quote(const char* signed_str, int length) {
  const unsigned char* bytes =
      reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes share the E2 80 prefix and differ only in
    // the final byte.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}
657 
658 // Callback helper for fix_quotes returns a double quote if both
659 // arguments are quote, otherwise INVALID_UNICHAR_ID.
661  const char *ch = uch_set->id_to_unichar(id1);
662  const char *next_ch = uch_set->id_to_unichar(id2);
663  if (is_simple_quote(ch, strlen(ch)) &&
664  is_simple_quote(next_ch, strlen(next_ch)))
665  return uch_set->unichar_to_id("\"");
666  return INVALID_UNICHAR_ID;
667 }
668 
669 // Change pairs of quotes to double quotes.
670 void WERD_RES::fix_quotes(BLOB_CHOICE_LIST_CLIST* blob_choices) {
671  if (!uch_set->contains_unichar("\"") ||
673  return; // Don't create it if it is disallowed.
674 
677  NULL,
678  blob_choices);
679 }
680 
681 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
682 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
684  const char *ch = uch_set->id_to_unichar(id1);
685  const char *next_ch = uch_set->id_to_unichar(id2);
686  if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
687  (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
688  return uch_set->unichar_to_id("-");
689  return INVALID_UNICHAR_ID;
690 }
691 
692 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
693 // (assuming both on the same textline, are in order and a chopped em dash.)
694 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
695  return box1.right() >= box2.left();
696 }
697 
698 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
699 // Typically a long dash which has been segmented.
700 void WERD_RES::fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices) {
701  if (!uch_set->contains_unichar("-") ||
703  return; // Don't create it if it is disallowed.
704 
708  blob_choices);
709 }
710 
711 // Callback helper for merge_tess_fails returns a space if both
712 // arguments are space, otherwise INVALID_UNICHAR_ID.
714  if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
715  return id1;
716  else
717  return INVALID_UNICHAR_ID;
718 }
719 
720 // Change pairs of tess failures to a single one
725  int len = best_choice->length();
726  ASSERT_HOST(reject_map.length() == len);
727  ASSERT_HOST(box_word->length() == len);
728  }
729 }
730 
731 // Returns true if the collection of count pieces, starting at start, are all
732 // natural connected components, ie there are no real chops involved.
733 bool WERD_RES::PiecesAllNatural(int start, int count) const {
734  // all seams must have no splits.
735  for (int index = start; index < start + count - 1; ++index) {
736  if (index >= 0 && index < array_count(seam_array)) {
737  SEAM* seam = reinterpret_cast<SEAM *>(array_value(seam_array, index));
738  if (seam != NULL && seam->split1 != NULL)
739  return false;
740  }
741  }
742  return true;
743 }
744 
745 
747  Clear();
748 }
749 
751  tess_failed = FALSE;
754  done = FALSE;
756  small_caps = false;
757  italic = FALSE;
758  bold = FALSE;
759  // The fontinfos and tesseract count as non-pointers as they point to
760  // data owned elsewhere.
761  fontinfo = NULL;
762  fontinfo2 = NULL;
763  tesseract = NULL;
764  fontinfo_id_count = 0;
765  fontinfo_id2_count = 0;
766  x_height = 0.0;
767  caps_height = 0.0;
768  guessed_x_ht = TRUE;
770  combination = FALSE;
773 }
774 
776  word = NULL;
777  bln_boxes = NULL;
778  uch_set = NULL;
779  chopped_word = NULL;
780  rebuild_word = NULL;
781  box_word = NULL;
782  seam_array = NULL;
783  best_choice = NULL;
784  raw_choice = NULL;
785  ep_choice = NULL;
787 }
788 
790  if (word != NULL && combination) {
791  delete word;
792  }
793  word = NULL;
794  delete blamer_bundle;
796  ClearResults();
797 }
798 
800  done = false;
801  fontinfo = NULL;
802  fontinfo2 = NULL;
803  fontinfo_id_count = 0;
804  fontinfo_id2_count = 0;
805  if (bln_boxes != NULL) {
806  delete bln_boxes;
807  bln_boxes = NULL;
808  }
809  if (chopped_word != NULL) {
810  delete chopped_word;
811  chopped_word = NULL;
812  }
813  if (rebuild_word != NULL) {
814  delete rebuild_word;
815  rebuild_word = NULL;
816  }
817  if (box_word != NULL) {
818  delete box_word;
819  box_word = NULL;
820  }
821  best_state.clear();
823  if (seam_array != NULL) {
825  seam_array = NULL;
826  }
827  if (best_choice != NULL) {
828  delete best_choice;
829  delete raw_choice;
830  best_choice = NULL;
831  raw_choice = NULL;
832  }
833  if (!alt_choices.empty()) {
835  alt_choices.clear();
836  }
837  alt_states.clear();
838  if (ep_choice != NULL) {
839  delete ep_choice;
840  ep_choice = NULL;
841  }
843 }
844 
845 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
846  return word_res == other.word_res &&
847  row_res == other.row_res &&
848  block_res == other.block_res;
849 }
850 
851 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
852  ASSERT_HOST(page_res == other.page_res);
853  if (other.block_res == NULL) {
854  // other points to the end of the page.
855  if (block_res == NULL)
856  return 0;
857  return -1;
858  }
859  if (block_res == NULL) {
860  return 1; // we point to the end of the page.
861  }
862  if (block_res == other.block_res) {
863  if (other.row_res == NULL || row_res == NULL) {
864  // this should only happen if we hit an image block.
865  return 0;
866  }
867  if (row_res == other.row_res) {
868  // we point to the same block and row.
869  ASSERT_HOST(other.word_res != NULL && word_res != NULL);
870  if (word_res == other.word_res) {
871  // we point to the same word!
872  return 0;
873  }
874 
875  WERD_RES_IT word_res_it(&row_res->word_res_list);
876  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
877  word_res_it.forward()) {
878  if (word_res_it.data() == word_res) {
879  return -1;
880  } else if (word_res_it.data() == other.word_res) {
881  return 1;
882  }
883  }
884  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
885  }
886 
887  // we both point to the same block, but different rows.
888  ROW_RES_IT row_res_it(&block_res->row_res_list);
889  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
890  row_res_it.forward()) {
891  if (row_res_it.data() == row_res) {
892  return -1;
893  } else if (row_res_it.data() == other.row_res) {
894  return 1;
895  }
896  }
897  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
898  }
899 
900  // We point to different blocks.
901  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
902  for (block_res_it.mark_cycle_pt();
903  !block_res_it.cycled_list(); block_res_it.forward()) {
904  if (block_res_it.data() == block_res) {
905  return -1;
906  } else if (block_res_it.data() == other.block_res) {
907  return 1;
908  }
909  }
910  // Shouldn't happen...
911  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
912  return 0;
913 }
914 
915 // Inserts the new_word and a corresponding WERD_RES before the current
916 // position. The simple fields of the WERD_RES are copied from clone_res and
917 // the resulting WERD_RES is returned for further setup with best_choice etc.
919  WERD* new_word) {
920  // Insert new_word into the ROW.
921  WERD_IT w_it(row()->row->word_list());
922  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
923  WERD* word = w_it.data();
924  if (word == word_res->word)
925  break;
926  }
927  ASSERT_HOST(!w_it.cycled_list());
928  w_it.add_before_then_move(new_word);
929  // Make a WERD_RES for the new_word.
930  WERD_RES* new_res = new WERD_RES(new_word);
931  new_res->CopySimpleFields(clone_res);
932  // Insert into the appropriate place in the ROW_RES.
933  WERD_RES_IT wr_it(&row()->word_res_list);
934  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
935  WERD_RES* word = wr_it.data();
936  if (word == word_res)
937  break;
938  }
939  ASSERT_HOST(!wr_it.cycled_list());
940  wr_it.add_before_then_move(new_res);
941  if (wr_it.at_first()) {
942  // This is the new first word, so reset the member iterator so it
943  // detects the cycled_list state correctly.
944  ResetWordIterator();
945  }
946  return new_res;
947 }
948 
949 // Deletes the current WERD_RES and its underlying WERD.
951  // Check that this word is as we expect. part_of_combos are NEVER iterated
952  // by the normal iterator, so we should never be trying to delete them.
953  ASSERT_HOST(!word_res->part_of_combo);
954  if (!word_res->combination) {
955  // Combinations own their own word, so we won't find the word on the
956  // row's word_list, but it is legitimate to try to delete them.
957  // Delete word from the ROW when not a combination.
958  WERD_IT w_it(row()->row->word_list());
959  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
960  if (w_it.data() == word_res->word) {
961  break;
962  }
963  }
964  ASSERT_HOST(!w_it.cycled_list());
965  delete w_it.extract();
966  }
967  // Remove the WERD_RES for the new_word.
968  // Remove the WORD_RES from the ROW_RES.
969  WERD_RES_IT wr_it(&row()->word_res_list);
970  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
971  if (wr_it.data() == word_res) {
972  word_res = NULL;
973  break;
974  }
975  }
976  ASSERT_HOST(!wr_it.cycled_list());
977  delete wr_it.extract();
978  ResetWordIterator();
979 }
980 
981 /*************************************************************************
982  * PAGE_RES_IT::restart_page
983  *
984  * Set things up at the start of the page
985  *************************************************************************/
986 
988  block_res_it.set_to_list(&page_res->block_res_list);
989  block_res_it.mark_cycle_pt();
990  prev_block_res = NULL;
991  prev_row_res = NULL;
992  prev_word_res = NULL;
993  block_res = NULL;
994  row_res = NULL;
995  word_res = NULL;
996  next_block_res = NULL;
997  next_row_res = NULL;
998  next_word_res = NULL;
999  internal_forward(true, empty_ok);
1000  return internal_forward(false, empty_ok);
1001 }
1002 
1003 // Recovers from operations on the current word, such as in InsertCloneWord
1004 // and DeleteCurrentWord.
1005 // Resets the word_res_it so that it is one past the next_word_res, as
1006 // it should be after internal_forward. If next_row_res != row_res,
1007 // then the next_word_res is in the next row, so there is no need to do
1008 // anything, since operations on the current word will not have disturbed
1009 // the word_res_it.
1010 void PAGE_RES_IT::ResetWordIterator() {
1011  if (row_res == next_row_res) {
1012  // Reset the member iterator so it can move forward and detect the
1013  // cycled_list state correctly.
1014  word_res_it.move_to_first();
1015  word_res_it.mark_cycle_pt();
1016  while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res)
1017  word_res_it.forward();
1018  ASSERT_HOST(!word_res_it.cycled_list());
1019  word_res_it.forward();
1020  }
1021 }
1022 
1023 /*************************************************************************
1024  * PAGE_RES_IT::internal_forward
1025  *
1026  * Find the next word on the page. If empty_ok is true, then non-text blocks
1027  * and text blocks with no text are visited as if they contain a single
1028  * imaginary word in a single imaginary row. (word() and row() both return NULL
1029  * in such a block and the return value is NULL.)
1030  * If empty_ok is false, the old behaviour is maintained. Each real word
1031  * is visited and empty and non-text blocks and rows are skipped.
1032  * new_block is used to initialize the iterators for a new block.
1033  * The iterator maintains pointers to block, row and word for the previous,
1034  * current and next words. These are correct, regardless of block/row
1035  * boundaries. NULL values denote start and end of the page.
1036  *************************************************************************/
1037 
1038 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1039  bool new_row = false;
1040 
1041  prev_block_res = block_res;
1042  prev_row_res = row_res;
1043  prev_word_res = word_res;
1044  block_res = next_block_res;
1045  row_res = next_row_res;
1046  word_res = next_word_res;
1047  next_block_res = NULL;
1048  next_row_res = NULL;
1049  next_word_res = NULL;
1050 
1051  while (!block_res_it.cycled_list()) {
1052  if (new_block) {
1053  new_block = false;
1054  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1055  row_res_it.mark_cycle_pt();
1056  if (row_res_it.empty() && empty_ok) {
1057  next_block_res = block_res_it.data();
1058  break;
1059  }
1060  new_row = true;
1061  }
1062  while (!row_res_it.cycled_list()) {
1063  if (new_row) {
1064  new_row = false;
1065  word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1066  word_res_it.mark_cycle_pt();
1067  }
1068  // Skip any part_of_combo words.
1069  while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1070  word_res_it.forward();
1071  if (!word_res_it.cycled_list()) {
1072  next_block_res = block_res_it.data();
1073  next_row_res = row_res_it.data();
1074  next_word_res = word_res_it.data();
1075  word_res_it.forward();
1076  goto foundword;
1077  }
1078  // end of row reached
1079  row_res_it.forward();
1080  new_row = true;
1081  }
1082  // end of block reached
1083  block_res_it.forward();
1084  new_block = true;
1085  }
1086  foundword:
1087  // Update prev_word_best_choice pointer.
1090  (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice;
1091  }
1092  return word_res;
1093 }
1094 
1095 /*************************************************************************
1096  * PAGE_RES_IT::restart_row()
1097  *
1098  * Move to the beginning (leftmost word) of the current row.
1099  *************************************************************************/
1101  ROW_RES *row = this->row();
1102  if (!row) return NULL;
1103  for (restart_page(); this->row() != row; forward()) {
1104  // pass
1105  }
1106  return word();
1107 }
1108 
1109 /*************************************************************************
1110  * PAGE_RES_IT::forward_paragraph
1111  *
1112  * Move to the beginning of the next paragraph, allowing empty blocks.
1113  *************************************************************************/
1114 
1116  while (block_res == next_block_res &&
1117  (next_row_res != NULL && next_row_res->row != NULL &&
1118  row_res->row->para() == next_row_res->row->para())) {
1119  internal_forward(false, true);
1120  }
1121  return internal_forward(false, true);
1122 }
1123 
1124 /*************************************************************************
1125  * PAGE_RES_IT::forward_block
1126  *
1127  * Move to the beginning of the next block, allowing empty blocks.
1128  *************************************************************************/
1129 
1131  while (block_res == next_block_res) {
1132  internal_forward(false, true);
1133  }
1134  return internal_forward(false, true);
1135 }
1136 
1138  inT16 chars_in_word;
1139  inT16 rejects_in_word = 0;
1140 
1141  chars_in_word = word_res->reject_map.length ();
1142  page_res->char_count += chars_in_word;
1143  block_res->char_count += chars_in_word;
1144  row_res->char_count += chars_in_word;
1145 
1146  rejects_in_word = word_res->reject_map.reject_count ();
1147 
1148  page_res->rej_count += rejects_in_word;
1149  block_res->rej_count += rejects_in_word;
1150  row_res->rej_count += rejects_in_word;
1151  if (chars_in_word == rejects_in_word)
1152  row_res->whole_word_rej_count += rejects_in_word;
1153 }
static BoxWord * CopyFromNormalized(const DENORM *denorm, TWERD *tessword)
Definition: boxword.cpp:67
int length() const
Definition: ratngs.h:214
void ClearResults()
Definition: pageres.cpp:799
void delete_data_pointers()
TWERD * rebuild_word
Definition: pageres.h:381
const char kBlameSegsearchPP[]
Definition: pageres.cpp:33
bool SetupForTessRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, bool numeric_mode, bool use_body_size, ROW *row, BLOCK *block)
Definition: pageres.cpp:272
const int length() const
Definition: boxword.h:99
SEAMS start_seam_list(TBLOB *blobs)
Definition: seam.cpp:175
const char *const kIncorrectResultReasonNames[]
Definition: pageres.cpp:40
const char * IncorrectReason() const
Definition: pageres.cpp:59
TBOX bounding_box()
Definition: werd.cpp:164
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
bool script_has_xheight() const
Definition: unicharset.h:770
int latin_sid() const
Definition: unicharset.h:754
BOOL8 font_assigned
Definition: pageres.h:264
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
static TWERD * PolygonalCopy(WERD *src)
Definition: blobs.cpp:405
int UNICHAR_ID
Definition: unichar.h:31
const char kBlameNoTruth[]
Definition: pageres.cpp:37
inT32 rej_count
Definition: pageres.h:221
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
void InitNonPointers()
Definition: pageres.cpp:750
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:217
BLOB_CHOICE_LIST_CLIST * blob_choices()
Definition: ratngs.h:244
BOOL8 done
Definition: pageres.h:419
WERD_RES * forward_block()
Definition: pageres.cpp:1130
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
BLOCK * block
Definition: pageres.h:258
BLOCK_RES()
Definition: pageres.h:271
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) PAGE_RES
Definition: pageres.cpp:85
~WERD_RES()
Definition: pageres.cpp:746
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:660
void set_script_id(int id)
Definition: werd.h:113
ROW_RES * row() const
Definition: pageres.h:760
void set_pix(Pix *pix)
Definition: normalis.h:246
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:918
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:411
BLOCK_RES_LIST block_res_list
Definition: pageres.h:222
const FontInfo * fontinfo
Definition: pageres.h:424
bool SetupForCubeRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, const BLOCK *block)
Definition: pageres.cpp:317
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX & > *box_cb, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: pageres.cpp:587
virtual void clear()
void SetupBLNormalize(const BLOCK *block, const ROW *row, float x_height, bool numeric_mode, DENORM *denorm) const
Definition: blobs.cpp:424
inT32 length() const
Definition: strngs.cpp:151
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.cpp:313
const FontInfo * fontinfo2
Definition: pageres.h:425
BOOL8 part_of_combo
Definition: pageres.h:451
GenericVector< WERD_CHOICE * > alt_choices
Definition: pageres.h:363
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:987
float x_scale() const
Definition: normalis.h:264
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end)
Definition: seam.cpp:535
void InitPointers()
Definition: pageres.cpp:775
tesseract::Tesseract * tesseract
Definition: pageres.h:403
void ClearResults()
Definition: pageres.h:103
WERD_RES * restart_page()
Definition: pageres.h:713
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1115
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:321
void merge_tess_fails()
Definition: pageres.cpp:721
REJMAP reject_map
Definition: pageres.h:408
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:138
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug)
Definition: pageres.cpp:63
#define NULL
Definition: host.h:144
Definition: blobs.h:233
inT32 char_count
Definition: pageres.h:259
inT16 left() const
Definition: rect.h:67
void MergeBoxes(int start, int end)
Definition: boxword.cpp:177
BOOL8 combination
Definition: pageres.h:450
WERD_RES_LIST word_res_list
Definition: pageres.h:290
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:713
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:121
GenericVector< int > best_state
Definition: pageres.h:392
void SetScriptPositions(const UNICHARSET &unicharset, bool small_caps, TWERD *tessword, WERD_CHOICE *best_choice)
Definition: boxword.cpp:108
BOOL8 guessed_x_ht
Definition: pageres.h:428
void NormTransform(const TPOINT &pt, TPOINT *transformed) const
Definition: normalis.cpp:190
virtual R Run(A1, A2)=0
Definition: rect.h:29
#define f(xc, yc)
Definition: imgscale.cpp:39
BOOL8 bold
Definition: pageres.h:266
#define FALSE
Definition: capi.h:28
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:694
TBLOB * blobs
Definition: blobs.h:274
int push_back(T object)
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
const char kBlamePageLayout[]
Definition: pageres.cpp:31
inT16 right() const
Definition: rect.h:74
const char kBlameAdaption[]
Definition: pageres.cpp:35
inT16 font_class
Definition: pageres.h:261
BOOL8 tess_would_adapt
Definition: pageres.h:418
BOOL8 reject_spaces
Definition: pageres.h:452
void CopyTruth(const BlamerBundle &other)
Definition: pageres.h:117
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end)
Definition: seam.cpp:564
const char kBlameUnknown[]
Definition: pageres.cpp:38
void set_inverse(bool value)
Definition: normalis.h:252
void reserve(int size)
const char kBlameSegsearchHeur[]
Definition: pageres.cpp:32
inT16 y
Definition: blobs.h:68
GenericVector< GenericVector< int > > alt_states
Definition: pageres.h:364
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:683
const char kBlameCorrect[]
Definition: pageres.cpp:27
ROW_RES()
Definition: pageres.h:292
WERD_RES * word() const
Definition: pageres.h:757
void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: pageres.cpp:700
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.cpp:845
WERD_RES * restart_row()
Definition: pageres.cpp:1100
GenericVector< inT8 > best_choice_fontinfo_ids
Definition: pageres.h:454
tesseract::BoxWord truth_word
Definition: pageres.h:167
void CloneChoppedToRebuild()
Definition: pageres.cpp:480
void remove_pos(inT16 pos)
Definition: rejctmap.cpp:371
SEAMS seam_array
Definition: pageres.h:358
WERD * word
Definition: pageres.h:334
inT16 x
Definition: blobs.h:67
inT8 bold
Definition: pageres.h:422
tesseract::BoxWord norm_truth_word
Definition: pageres.h:170
const UNICHARSET * uch_set
Definition: pageres.h:348
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:733
SPLIT * split1
Definition: seam.h:46
const char kBlameClassOldLMTradeoff[]
Definition: pageres.cpp:34
Definition: ocrrow.h:32
Definition: werd.h:44
inT32 rej_count
Definition: pageres.h:260
IncorrectResultReason
Definition: pageres.h:45
GenericVector< STRING > truth_text
Definition: pageres.h:174
void rej_stat_word()
Definition: pageres.cpp:1137
POLY_BLOCK * poly_block() const
Definition: pdblock.h:62
Definition: blobs.h:53
Definition: blobs.h:174
WERD_CHOICE * ep_choice
Definition: pageres.h:407
bool empty() const
Definition: genericvector.h:68
Definition: ocrblock.h:31
PARA * para() const
Definition: ocrrow.h:112
#define ELISTIZE(CLASSNAME)
Definition: elst.h:992
void Normalize(const DENORM &denorm)
Definition: blobs.cpp:447
BOOL8 guessed_caps_ht
Definition: pageres.h:429
float x_height
Definition: pageres.h:263
void WithoutFootnoteSpan(int *start, int *end) const
Definition: pageres.cpp:510
float x_height
Definition: pageres.h:431
const char kBlameChopper[]
Definition: pageres.cpp:29
inT16 top() const
Definition: rect.h:53
tesseract::BoxWord * bln_boxes
Definition: pageres.h:343
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:123
WERD_RES * forward()
Definition: pageres.h:737
ROW_RES_LIST row_res_list
Definition: pageres.h:269
void free_seam_list(SEAMS seam_list)
Definition: seam.cpp:200
void CopyResults(const BlamerBundle &other)
Definition: pageres.h:124
void remove_unichar_id(int index)
Definition: ratngs.h:357
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:543
int default_sid() const
Definition: unicharset.h:760
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:226
DENORM denorm
Definition: pageres.h:346
inT8 fontinfo_id2_count
Definition: pageres.h:427
WERD_CHOICE * raw_choice
Definition: pageres.h:360
inT32 whole_word_rej_count
Definition: pageres.h:289
float x_height() const
Definition: ocrrow.h:61
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: pageres.cpp:55
ScriptPos BlobPosition(int index) const
Definition: boxword.h:105
void BestChoiceToCorrectText()
Definition: pageres.cpp:572
inT16 row_count
Definition: pageres.h:262
Definition: strngs.h:40
Definition: werd.h:60
void move(GenericVector< T > *from)
int size() const
Definition: genericvector.h:59
const char kBlameNoTruthSplit[]
Definition: pageres.cpp:36
short inT16
Definition: host.h:100
void RebuildBestState()
Definition: pageres.cpp:452
inT32 char_count
Definition: pageres.h:287
void SetupNormalization(const BLOCK *block, const ROW *row, const FCOORD *rotation, const DENORM *predecessor, const DENORM_SEG *segs, int num_segs, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift)
Definition: normalis.cpp:143
void InsertBox(int index, const TBOX &box)
Definition: boxword.cpp:194
int length() const
Definition: genericvector.h:63
void SetupBlamerBundle()
Definition: pageres.cpp:379
GenericVector< STRING > correct_text
Definition: pageres.h:396
bool truth_has_char_boxes
Definition: pageres.h:164
tesseract::BoxWord * box_word
Definition: pageres.h:387
const TBOX & BlobBox(int index) const
Definition: boxword.h:102
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:247
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:747
BOOL8 tess_accepted
Definition: pageres.h:417
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
virtual void remove(int index)
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:234
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:340
float body_size() const
Definition: ocrrow.h:70
const BLOCK * block() const
Definition: normalis.h:276
inT8 italic
Definition: pageres.h:421
WERD_LIST * word_list()
Definition: ocrrow.h:52
TWERD * chopped_word
Definition: pageres.h:357
const char kBlameClassLMTradeoff[]
Definition: pageres.cpp:30
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:370
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: ratngs.cpp:184
inT32 length() const
Definition: rejctmap.h:238
void SetScriptPositions()
Definition: pageres.cpp:505
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:549
void ComputeBoundingBoxes()
Definition: blobs.cpp:477
void Clear()
Definition: pageres.cpp:789
void MergeBlobs(int start, int end)
Definition: blobs.cpp:494
inT32 char_count
Definition: pageres.h:220
inT8 fontinfo_id_count
Definition: pageres.h:426
#define array_count(a)
Definition: tessarray.h:74
BOOL8 tess_failed
Definition: pageres.h:409
#define array_value(a, i)
Definition: tessarray.h:132
BOOL8 italic
Definition: pageres.h:267
void ReplaceBestChoice(const WERD_CHOICE &choice, const GenericVector< int > &segmentation_state)
Definition: pageres.cpp:436
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:851
inT32 rej_count
Definition: pageres.h:288
float caps_height
Definition: pageres.h:432
bool IsText() const
Definition: polyblk.h:54
void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: pageres.cpp:670
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:260
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:430
int count(LIST var_list)
Definition: oldlist.cpp:108
void SetupBoxWord()
Definition: pageres.cpp:495
void copy_on(WERD_RES *word_res)
Definition: pageres.h:674
int norm_box_tolerance
Definition: pageres.h:172
PAGE_RES * page_res
Definition: pageres.h:691
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:294
const char kBlameClassifier[]
Definition: pageres.cpp:28
const UNICHARSET * unicharset() const
Definition: ratngs.h:211
inT16 reject_count()
Definition: rejctmap.h:244
void DeleteCurrentWord()
Definition: pageres.cpp:950
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
void operator=(const ELIST_LINK &)
Definition: elst.h:102
BlamerBundle * blamer_bundle
Definition: pageres.h:367
TBLOB * next
Definition: blobs.h:228
void initialise(inT16 length)
Definition: rejctmap.cpp:324
ROW * row
Definition: pageres.h:286
bool small_caps
Definition: pageres.h:420
#define TRUE
Definition: capi.h:27
inT16 bottom() const
Definition: rect.h:60
WERD_CHOICE * best_choice
Definition: pageres.h:359
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:177