Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tfacepp.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tfacepp.cpp (Formerly tface++.c)
3  * Description: C++ side of the C/C++ Tess/Editor interface.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 15:39:23 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #pragma warning(disable:4305) // int/float warnings
23 #pragma warning(disable:4800) // int/bool warnings
24 #endif
25 
26 #include <math.h>
27 
28 #include "mfcpch.h"
29 #ifdef __UNIX__
30 #include <assert.h>
31 #endif
32 #include "errcode.h"
33 #include "ratngs.h"
34 #include "reject.h"
35 #include "werd.h"
36 #include "tfacep.h"
37 #include "tfacepp.h"
38 #include "tessvars.h"
39 #include "globals.h"
40 #include "reject.h"
41 #include "tesseractclass.h"
42 
43 #define MAX_UNDIVIDED_LENGTH 24
44 
45 
46 
47 /**********************************************************************
48  * recog_word
49  *
50  * Convert the word to tess form and pass it to the tess segmenter.
51  * Convert the output back to editor form.
52  **********************************************************************/
53 namespace tesseract {
55  BLOB_CHOICE_LIST_CLIST *blob_choices) {
    // NOTE(review): this listing skips numbered lines 54, 56, 69, 78, 83 and
    // 96, so the start of the signature and a few statements/condition
    // fragments are not visible. The cross-reference index on this page gives
    // the signature as
    //   void recog_word(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices).
    // Run recognition (recog_word_recursive hands words with more than
    // MAX_UNDIVIDED_LENGTH blobs to split_and_recog_word), then call
    // SetupBoxWord on the result.
57  recog_word_recursive(word, blob_choices);
58  word->SetupBoxWord();
    // Print the three lengths when best_choice disagrees with box_word or
    // blob_choices; the two ASSERT_HOST checks below test the same equalities.
59  if ((word->best_choice->length() != word->box_word->length()) ||
60  (word->best_choice->length() != blob_choices->length())) {
61  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
62  "Strlen=%d; #Blobs=%d; #Choices=%d\n",
63  word->best_choice->debug_string().string(),
64  word->best_choice->length(), word->box_word->length(),
65  blob_choices->length());
66  }
67  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
68  ASSERT_HOST(word->best_choice->length() == blob_choices->length());
70  /* Override the permuter type if a straight dictionary check disagrees. */
71  uinT8 perm_type = word->best_choice->permuter();
    // Only re-check words whose permuter is not already one of the three
    // dawg (dictionary) permuter types.
72  if ((perm_type != SYSTEM_DAWG_PERM) &&
73  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
74  uinT8 real_dict_perm_type = dict_word(*word->best_choice);
75  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
76  (real_dict_perm_type == FREQ_DAWG_PERM) ||
77  (real_dict_perm_type == USER_DAWG_PERM)) &&
79  word->best_choice->unichar_lengths().string()) > 0)) {
80  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
81  }
82  }
84  perm_type != word->best_choice->permuter()) {
85  tprintf("Permuter Type Flipped from %d to %d\n",
86  perm_type, word->best_choice->permuter());
87  }
88  }
89  // Factored out from control.cpp
    // NOTE(review): word->best_choice has already been dereferenced earlier in
    // this function, so the NULL tests below run after the pointer was used.
90  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
    // Mark the word as a Tess failure when there is no best choice, it is
    // empty, or the run of leading spaces in its string (strspn) is as long as
    // its length; the reject map is then initialised to box_word's length.
91  if (word->best_choice == NULL || word->best_choice->length() == 0 ||
92  strspn(word->best_choice->unichar_string().string(), " ") ==
93  word->best_choice->length()) {
94  word->tess_failed = true;
95  word->reject_map.initialise(word->box_word->length());
97  } else {
98  word->tess_failed = false;
99  }
100 }
101 
102 
103 /**********************************************************************
104  * recog_word_recursive
105  *
106  * Recognize the word with cc_recog, first handing it to split_and_recog_word
107  * if it has more than MAX_UNDIVIDED_LENGTH blobs, then fix up result lengths.
108  **********************************************************************/
110  BLOB_CHOICE_LIST_CLIST *blob_choices) {
     // NOTE(review): numbered line 109 (the start of the signature) is missing
     // from this listing. The cross-reference index on this page gives it as
     //   void recog_word_recursive(WERD_RES *word,
     //                             BLOB_CHOICE_LIST_CLIST *blob_choices).
111  int word_length = word->chopped_word->NumBlobs(); // no of blobs
     // Words with more than MAX_UNDIVIDED_LENGTH blobs are not recognized
     // here; they are passed to split_and_recog_word instead.
112  if (word_length > MAX_UNDIVIDED_LENGTH) {
113  return split_and_recog_word(word, blob_choices);
114  }
     // Remember how many lists blob_choices already held, so the checks below
     // count only the lists appended by this call.
115  int initial_blob_choice_len = blob_choices->length();
116  BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word);
117 
118  // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
119  BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
120  for (int i = 0; i < tess_ratings->length(); ++i) {
121  blob_choices_it.add_to_end(tess_ratings->get(i));
122  }
     // The vector object itself is deleted once its elements have been
     // appended to blob_choices.
123  delete tess_ratings;
124 
125  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
126  // Pad raw_choice with spaces if needed.
127  if (word->raw_choice->length() < word_length) {
128  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
     // Each pad is the space unichar, appended with the arguments 1 and 0.0
     // and the choice's current certainty.
129  while (word->raw_choice->length() < word_length) {
130  word->raw_choice->append_unichar_id(space_id, 1, 0.0,
131  word->raw_choice->certainty());
132  }
133  }
134 
135  // Do sanity checks and minor fixes on best_choice.
136  if (word->best_choice->length() > word_length) {
137  word->best_choice->make_bad(); // should never happen
138  tprintf("recog_word: Discarded long string \"%s\""
139  " (%d characters vs %d blobs)\n",
140  word->best_choice->unichar_string().string(),
141  word->best_choice->length(), word_length);
142  tprintf("Word is at:");
143  word->word->bounding_box().print();
144  }
     // The number of lists appended by this call must equal the number of
     // output blobs; if not, mark best_choice bad and pad with empty lists or
     // delete lists from the end until the counts agree.
145  if (blob_choices->length() - initial_blob_choice_len != word_length) {
146  word->best_choice->make_bad(); // force rejection
147  tprintf("recog_word: Choices list len:%d; blob lists len:%d\n",
148  blob_choices->length(), word_length);
149  blob_choices_it.set_to_list(blob_choices); // list of lists
150  while (blob_choices->length() - initial_blob_choice_len < word_length) {
151  blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one
152  tprintf("recog_word: Added dummy choice list\n");
153  }
154  while (blob_choices->length() - initial_blob_choice_len > word_length) {
155  blob_choices_it.move_to_last(); // should never happen
156  delete blob_choices_it.extract();
157  tprintf("recog_word: Deleted choice list\n");
158  }
159  }
     // Pad best_choice with spaces up to the blob count, mirroring the
     // raw_choice padding above.
160  if (word->best_choice->length() < word_length) {
161  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
162  while (word->best_choice->length() < word_length) {
163  word->best_choice->append_unichar_id(space_id, 1, 0.0,
164  word->best_choice->certainty());
165  }
166  }
167 }
168 
169 
170 /**********************************************************************
171  * split_and_recog_word
172  *
173  * Split the word into 2 smaller pieces at the largest gap.
174  * Recognize the pieces and stick the results back together.
175  **********************************************************************/
176 
178  BLOB_CHOICE_LIST_CLIST *blob_choices) {
     // NOTE(review): this listing skips numbered lines 177, 213, 251, 253, 263,
     // 264, 270, 272, 328, 345, 347-349, 352, 354-356, 360, 367 and 370, so the
     // start of the signature and several statements/arguments are not visible
     // here. The cross-reference index on this page gives the signature as
     //   void split_and_recog_word(WERD_RES *word,
     //                             BLOB_CHOICE_LIST_CLIST *blob_choices).
179  // Find the biggest blob gap in the chopped_word.
180  int bestgap = -MAX_INT32;
181  TPOINT best_split_pt;
182  TBLOB* best_end = NULL;
183  TBLOB* prev_blob = NULL;
184  for (TBLOB* blob = word->chopped_word->blobs; blob != NULL;
185  blob = blob->next) {
186  if (prev_blob != NULL) {
187  TBOX prev_box = prev_blob->bounding_box();
188  TBOX blob_box = blob->bounding_box();
189  int gap = blob_box.left() - prev_box.right();
190  if (gap > bestgap) {
191  bestgap = gap;
192  best_end = prev_blob;
     // Split point: x midway between the facing box edges, y the mean of the
     // four top/bottom values of the two boxes.
193  best_split_pt.x = (prev_box.right() + blob_box.left()) / 2;
194  best_split_pt.y = (prev_box.top() + prev_box.bottom() +
195  blob_box.top() + blob_box.bottom()) / 4;
196  }
197  }
198  prev_blob = blob;
199  }
     // A split needs a blob on each side: best_end must exist and must not be
     // the last blob in the chain.
200  ASSERT_HOST(best_end != NULL);
201  ASSERT_HOST(best_end->next != NULL);
202 
203  // Make a copy of the word to put the 2nd half in.
204  WERD_RES* word2 = new WERD_RES(*word);
205  // Blow away the copied chopped_word, as we want to work with the blobs
206  // from the input chopped_word so the seam_arrays can be merged.
207  delete word2->chopped_word;
208  word2->chopped_word = new TWERD;
     // word2 takes the blob chain after best_end; cutting best_end->next leaves
     // word with the chain up to and including best_end.
209  word2->chopped_word->blobs = best_end->next;
210  best_end->next = NULL;
211  // Make a new seamarray on both words.
212  free_seam_list(word->seam_array);
214  word2->seam_array = start_seam_list(word2->chopped_word->blobs);
215  BlamerBundle *orig_bb = word->blamer_bundle;
216  STRING blamer_debug;
217  // Try to adjust truth information.
218  if (orig_bb != NULL) {
219  // Find truth boxes that correspond to the split in the blobs.
220  int b;
     // Stays -1 unless the loop below finds truth boxes matching the split.
221  int begin2_truth_index = -1;
222  if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH &&
223  orig_bb->truth_has_char_boxes) {
224  int end1_x = best_end->bounding_box().right();
225  int begin2_x = word2->chopped_word->blobs->bounding_box().left();
226  blamer_debug = "Looking for truth split at";
227  blamer_debug.add_str_int(" end1_x ", end1_x);
228  blamer_debug.add_str_int(" begin2_x ", begin2_x);
229  blamer_debug += "\nnorm_truth_word boxes:\n";
230  if (orig_bb->norm_truth_word.length() > 1) {
231  orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug);
     // A truth split at index b requires box b-1 to end within
     // norm_box_tolerance of end1_x and box b to start within the same
     // tolerance of begin2_x.
232  for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) {
233  orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug);
234  if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) <
235  orig_bb->norm_box_tolerance) &&
236  (abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) <
237  orig_bb->norm_box_tolerance)) {
238  begin2_truth_index = b;
239  blamer_debug += "Split found\n";
240  break;
241  }
242  }
243  }
244  }
245  // Populate truth information in word and word2 with the first and second
246  // part of the original truth.
247  word->blamer_bundle = new BlamerBundle();
248  word2->blamer_bundle = new BlamerBundle();
249  if (begin2_truth_index > 0) {
250  word->blamer_bundle->truth_has_char_boxes = true;
252  word2->blamer_bundle->truth_has_char_boxes = true;
     // Truth entries before begin2_truth_index go to word's bundle; from that
     // index onwards they go to word2's bundle.
254  BlamerBundle *curr_bb = word->blamer_bundle;
255  for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) {
256  if (b == begin2_truth_index) curr_bb = word2->blamer_bundle;
257  curr_bb->norm_truth_word.InsertBox(
258  b, orig_bb->norm_truth_word.BlobBox(b));
259  curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b));
260  curr_bb->truth_text.push_back(orig_bb->truth_text[b]);
261  }
262  } else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) {
265  } else {
266  blamer_debug += "Truth split not found";
267  blamer_debug += orig_bb->truth_has_char_boxes ?
268  "\n" : " (no truth char boxes)\n";
269  word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
271  word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
273  }
274  }
275 
276  // Recognize the first part of the word.
277  recog_word_recursive(word, blob_choices);
278  // Recognize the second part of the word.
279  recog_word_recursive(word2, blob_choices);
280  // Tack the word2 outputs onto the end of the word outputs.
281  // New blobs might have appeared on the end of word1.
282  for (best_end = word->chopped_word->blobs; best_end->next != NULL;
283  best_end = best_end->next);
284  best_end->next = word2->chopped_word->blobs;
285  TBLOB* blob;
286  for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next);
287  blob->next = word2->rebuild_word->blobs;
     // word2's blob pointers are cleared now that both chains hang off word
     // (presumably so that deleting word2 below does not free them - confirm).
288  word2->chopped_word->blobs = NULL;
289  word2->rebuild_word->blobs = NULL;
290  // Copy the seams onto the end of the word1 seam_array.
291  // Since the seam list is one element short, an empty seam marking the
292  // end of the last blob in the first word is needed first.
293  word->seam_array = add_seam(word->seam_array,
294  new_seam(0.0, best_split_pt, NULL, NULL, NULL));
     // Each slot in word2->seam_array is set to NULL as its seam is appended
     // to word->seam_array.
295  for (int i = 0; i < array_count(word2->seam_array); ++i) {
296  SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i));
297  array_value(word2->seam_array, i) = NULL;
298  word->seam_array = add_seam(word->seam_array, seam);
299  }
300  word->best_state += word2->best_state;
301  // Append the word choices.
302  *word->best_choice += *word2->best_choice;
303  *word->raw_choice += *word2->raw_choice;
304 
305  // How many alt choices from each should we try to get?
306  const int kAltsPerPiece = 2;
307  // When do we start throwing away extra alt choices?
308  const int kTooManyAltChoices = 100;
309 
310  if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) {
311  // Construct the cartesian product of the alt choices of word(1) and word2.
312  int num_first_alt_choices = word->alt_choices.size();
313  // Nota Bene: For the main loop here, we leave in place word1-only
314  // alt_choices in
315  // word->alt_choices[0] .. word->alt_choices[num_first_alt_choices - 1]
316  // These will get fused with the best choices for word2 below.
     // j starts at 1 because word2's alt choice 0 is fused onto the original
     // word1 alt choices after this loop.
317  for (int j = 1; j < word2->alt_choices.size() &&
318  (j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices);
319  j++) {
320  for (int i = 0; i < num_first_alt_choices &&
321  (i <= kAltsPerPiece ||
322  word->alt_choices.size() < kTooManyAltChoices);
323  i++) {
324  WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]);
325  *wc += *word2->alt_choices[j];
326  word->alt_choices.push_back(wc);
327 
329  GenericVector<int> &alt_state = word->alt_states.back();
330  alt_state += word->alt_states[i];
331  alt_state += word2->alt_states[j];
332  }
333  }
334  // Now that we've filled in as many alternates as we want, paste the best
335  // choice for word2 onto the original word alt_choices.
336  for (int i = 0; i < num_first_alt_choices; i++) {
337  *word->alt_choices[i] += *word2->alt_choices[0];
338  word->alt_states[i] += word2->alt_states[0];
339  }
340  }
341 
342  // Restore the pointer to original blamer bundle and combine blamer
343  // information recorded in the splits.
344  if (orig_bb != NULL) {
346  if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = "";
350  blamer_debug += "Blame from part 1: ";
351  blamer_debug += word->blamer_bundle->debug;
353  }
357  blamer_debug += "Blame from part 2: ";
358  blamer_debug += word2->blamer_bundle->debug;
359  if (irr == IRR_CORRECT) {
361  } else if (irr != word2->blamer_bundle->incorrect_result_reason) {
362  irr = IRR_UNKNOWN;
363  }
364  }
     // Drop the temporary bundle created for the first half and put the
     // original bundle back on word.
365  delete word->blamer_bundle;
366  word->blamer_bundle = orig_bb;
368  if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
369  word->blamer_bundle->SetBlame(irr, blamer_debug, NULL,
371  }
372  }
373  delete word2;
374 }
375 
376 } // namespace tesseract
int length() const
Definition: ratngs.h:214
TWERD * rebuild_word
Definition: pageres.h:381
const STRING & unichar_string() const
Definition: ratngs.h:395
const int length() const
Definition: boxword.h:99
SEAMS start_seam_list(TBLOB *blobs)
Definition: seam.cpp:175
TBOX bounding_box()
Definition: werd.cpp:164
int UNICHAR_ID
Definition: unichar.h:31
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
const STRING debug_string() const
Definition: ratngs.h:373
BLOB_CHOICE_LIST_VECTOR * cc_recog(WERD_RES *word)
Definition: tface.cpp:117
float certainty() const
Definition: ratngs.h:234
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.cpp:313
GenericVector< WERD_CHOICE * > alt_choices
Definition: pageres.h:363
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:321
int NumBlobs() const
Definition: blobs.h:263
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
Definition: blobs.h:233
inT16 left() const
Definition: rect.h:67
GenericVector< int > best_state
Definition: pageres.h:392
void append_debug(STRING *str) const
Definition: rect.h:270
T & get(int index) const
Definition: rect.h:29
TBLOB * blobs
Definition: blobs.h:274
int push_back(T object)
inT16 right() const
Definition: rect.h:74
inT16 y
Definition: blobs.h:68
GenericVector< GenericVector< int > > alt_states
Definition: pageres.h:364
tesseract::BoxWord truth_word
Definition: pageres.h:167
void add_str_int(const char *str, int number)
Definition: strngs.cpp:334
SEAMS seam_array
Definition: pageres.h:358
#define MAX_INT32
Definition: host.h:120
WERD * word
Definition: pageres.h:334
inT16 x
Definition: blobs.h:67
tesseract::BoxWord norm_truth_word
Definition: pageres.h:170
uinT8 permuter() const
Definition: ratngs.h:237
void rej_word_tess_failure()
Definition: rejctmap.cpp:431
IncorrectResultReason
Definition: pageres.h:45
GenericVector< STRING > truth_text
Definition: pageres.h:174
Definition: blobs.h:53
Definition: blobs.h:174
const char * string() const
Definition: strngs.cpp:156
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:133
inT16 top() const
Definition: rect.h:53
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:43
void free_seam_list(SEAMS seam_list)
Definition: seam.cpp:200
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool wordrec_debug_blamer
Definition: wordrec.h:142
WERD_CHOICE * raw_choice
Definition: pageres.h:360
TBOX bounding_box() const
Definition: blobs.cpp:384
UNICHARSET unicharset
Definition: ccutil.h:72
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:659
const STRING & unichar_lengths() const
Definition: ratngs.h:402
STRING debug
Definition: pageres.h:178
T & back() const
void InsertBox(int index, const TBOX &box)
Definition: boxword.cpp:194
int length() const
Definition: genericvector.h:63
bool truth_has_char_boxes
Definition: pageres.h:164
tesseract::BoxWord * box_word
Definition: pageres.h:387
const TBOX & BlobBox(int index) const
Definition: boxword.h:102
TWERD * chopped_word
Definition: pageres.h:357
unsigned char uinT8
Definition: host.h:99
SEAM * new_seam(PRIORITY priority, const TPOINT &location, SPLIT *split1, SPLIT *split2, SPLIT *split3)
Definition: seam.cpp:421
SEAMS add_seam(SEAMS seam_list, SEAM *seam)
Definition: seam.cpp:104
#define array_count(a)
Definition: tessarray.h:74
BOOL8 tess_failed
Definition: pageres.h:409
#define array_value(a, i)
Definition: tessarray.h:132
#define ASSERT_HOST(x)
Definition: errcode.h:84
void recog_word_recursive(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tfacepp.cpp:109
void recog_word(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tfacepp.cpp:54
void split_and_recog_word(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tfacepp.cpp:177
void SetupBoxWord()
Definition: pageres.cpp:495
void print() const
Definition: rect.h:263
int norm_box_tolerance
Definition: pageres.h:172
BlamerBundle * blamer_bundle
Definition: pageres.h:367
TBLOB * next
Definition: blobs.h:228
void initialise(inT16 length)
Definition: rejctmap.cpp:324
inT16 bottom() const
Definition: rect.h:60
WERD_CHOICE * best_choice
Definition: pageres.h:359
void set_permuter(uinT8 perm)
Definition: ratngs.h:261