Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
reject.cpp File Reference
#include "mfcpch.h"
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "memry.h"
#include "reject.h"
#include "tfacep.h"
#include "imgs.h"
#include "control.h"
#include "docqual.h"
#include "secname.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"
#include "notdll.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
 
float compute_reject_threshold (BLOB_CHOICE_LIST_CLIST *blob_choices)
 

Function Documentation

CLISTIZEH ( STRING  )

Definition at line 55 of file reject.cpp.

63  {
64 void Tesseract::set_done( //set done flag
65  WERD_RES *word,
66  inT16 pass) {
67  /*
68  0: Original heuristic used in Tesseract and Ray's prototype Resaljet
69  */
70  if (tessedit_ok_mode == 0) {
71  /* NOTE - done even if word contains some or all spaces !!! */
72  word->done = word->tess_accepted;
73  }
74  /*
75  1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
76  */
77  else if (tessedit_ok_mode == 1) {
78  word->done = word->tess_accepted &&
79  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
80 
81  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
82  word->done = FALSE;
83  }
84  /*
85  2: as 1 + only accept dict words or numerics in pass 1
86  */
87  else if (tessedit_ok_mode == 2) {
88  word->done = word->tess_accepted &&
89  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
90 
91  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
92  word->done = FALSE;
93 
94  if (word->done &&
95  (pass == 1) &&
96  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
97  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
98  (word->best_choice->permuter () != USER_DAWG_PERM) &&
99  (word->best_choice->permuter () != NUMBER_PERM)) {
100  #ifndef SECURE_NAMES
101  if (tessedit_rejection_debug)
102  tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
103  word->best_choice->unichar_string().string ());
104  #endif
105  word->done = FALSE;
106  }
107  }
108  /*
109  3: as 2 + only accept dict words or numerics in pass 2 as well
110  */
111  else if (tessedit_ok_mode == 3) {
112  word->done = word->tess_accepted &&
113  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
114 
115  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
116  word->done = FALSE;
117 
118  if (word->done &&
119  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
120  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
121  (word->best_choice->permuter () != USER_DAWG_PERM) &&
122  (word->best_choice->permuter () != NUMBER_PERM)) {
123  #ifndef SECURE_NAMES
124  if (tessedit_rejection_debug)
125  tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
126  word->best_choice->unichar_string().string ());
127  #endif
128  word->done = FALSE;
129  }
130  }
131  /*
132  4: as 2 + reject dict ambigs in pass 1
133  */
134  else if (tessedit_ok_mode == 4) {
135  word->done = word->tess_accepted &&
136  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
137 
138  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
139  word->done = FALSE;
140 
141  if (word->done &&
142  (pass == 1) &&
143  (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
144  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
145  (word->best_choice->permuter () != USER_DAWG_PERM) &&
146  (word->best_choice->permuter () != NUMBER_PERM)) ||
147  (test_ambig_word (word)))) {
148  #ifndef SECURE_NAMES
149  if (tessedit_rejection_debug)
150  tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
151  word->best_choice->unichar_string().string ());
152  #endif
153  word->done = FALSE;
154  }
155  }
156  /*
157  5: as 3 + reject dict ambigs in both passes
158  */
159  else if (tessedit_ok_mode == 5) {
160  word->done = word->tess_accepted &&
161  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
162 
163  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
164  word->done = FALSE;
165 
166  if (word->done &&
167  (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
168  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
169  (word->best_choice->permuter () != USER_DAWG_PERM) &&
170  (word->best_choice->permuter () != NUMBER_PERM)) ||
171  (test_ambig_word (word)))) {
172  #ifndef SECURE_NAMES
173  if (tessedit_rejection_debug)
174  tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
175  word->best_choice->unichar_string().string ());
176  #endif
177  word->done = FALSE;
178  }
179  }
180 
181  else {
182  tprintf ("BAD tessedit_ok_mode\n");
183  err_exit();
184  }
185 }
186 
187 
188 /*************************************************************************
189  * make_reject_map()
190  *
191  * Sets the done flag to indicate whether the resylt is acceptable.
192  *
193  * Sets a reject map for the word.
194  *************************************************************************/
195 void Tesseract::make_reject_map( //make rej map for wd //detailed results
196  WERD_RES *word,
197  BLOB_CHOICE_LIST_CLIST *blob_choices,
198  ROW *row,
199  inT16 pass //1st or 2nd?
200  ) {
201  int i;
202  int offset;
203 
204  flip_0O(word);
205  check_debug_pt(word, -1); // For trap only
206  set_done(word, pass); // Set acceptance
208  reject_blanks(word);
209  /*
210  0: Rays original heuristic - the baseline
211  */
212  if (tessedit_reject_mode == 0) {
213  if (!word->done)
214  reject_poor_matches(word, blob_choices);
215  } else if (tessedit_reject_mode == 5) {
216  /*
217  5: Reject I/1/l from words where there is no strong contextual confirmation;
218  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
219  and the whole of any words which are very small
220  */
221  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
223  } else {
224  one_ell_conflict(word, TRUE);
225  /*
226  Originally the code here just used the done flag. Now I have duplicated
227  and unpacked the conditions for setting the done flag so that each
228  mechanism can be turned on or off independently. This works WITHOUT
229  affecting the done flag setting.
230  */
231  if (rej_use_tess_accepted && !word->tess_accepted)
233 
234  if (rej_use_tess_blanks &&
235  (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
237 
238  WERD_CHOICE* best_choice = word->best_choice;
239  if (rej_use_good_perm) {
240  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
241  best_choice->permuter() == FREQ_DAWG_PERM ||
242  best_choice->permuter() == USER_DAWG_PERM) &&
243  (!rej_use_sensible_wd ||
244  acceptable_word_string(*word->uch_set,
245  best_choice->unichar_string().string(),
246  best_choice->unichar_lengths().string()) !=
247  AC_UNACCEPTABLE)) {
248  // PASSED TEST
249  } else if (best_choice->permuter() == NUMBER_PERM) {
250  if (rej_alphas_in_number_perm) {
251  for (i = 0, offset = 0;
252  best_choice->unichar_string()[offset] != '\0';
253  offset += best_choice->unichar_lengths()[i++]) {
254  if (word->reject_map[i].accepted() &&
255  word->uch_set->get_isalpha(
256  best_choice->unichar_string().string() + offset,
257  best_choice->unichar_lengths()[i]))
258  word->reject_map[i].setrej_bad_permuter();
259  // rej alpha
260  }
261  }
262  } else {
264  }
265  }
266  /* Ambig word rejection was here once !!*/
267  }
268  } else {
269  tprintf("BAD tessedit_reject_mode\n");
270  err_exit();
271  }
272 
273  if (tessedit_image_border > -1)
274  reject_edge_blobs(word);
275 
276  check_debug_pt (word, 10);
277  if (tessedit_rejection_debug) {
278  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
279  tprintf("Certainty: %f Rating: %f\n",
280  word->best_choice->certainty (), word->best_choice->rating ());
281  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
282  }
283 
284  flip_hyphens(word);
285  check_debug_pt(word, 20);
286 }
287 } // namespace tesseract
const int kBlnXHeight
Definition: normalis.h:27
const STRING & unichar_string() const
Definition: ratngs.h:395
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
Unacceptable word.
Definition: control.h:37
BOOL8 done
Definition: pageres.h:419
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:440
void err_exit()
Definition: globaloc.cpp:73
inT32 length() const
Definition: strngs.cpp:151
float certainty() const
Definition: ratngs.h:234
void reject_poor_matches(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: reject.cpp:319
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:28
void flip_0O(WERD_RES *word)
void rej_word_bad_permuter()
Definition: rejctmap.cpp:458
void flip_hyphens(WERD_RES *word)
const UNICHARSET * uch_set
Definition: pageres.h:348
Definition: ocrrow.h:32
uinT8 permuter() const
Definition: ratngs.h:237
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
DENORM denorm
Definition: pageres.h:346
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
void rej_word_small_xht()
Definition: rejctmap.cpp:422
BOOL8 tess_accepted
Definition: pageres.h:417
float y_scale() const
Definition: normalis.h:267
void rej_word_contains_blanks()
Definition: rejctmap.cpp:449
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:290
void initialise(inT16 length)
Definition: rejctmap.cpp:324
float rating() const
Definition: ratngs.h:231
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
float compute_reject_threshold ( BLOB_CHOICE_LIST_CLIST *  blob_choices)

Definition at line 370 of file reject.cpp.

371  {
372  inT16 index; //to ratings
373  inT16 blob_count; //no of blobs in word
374  inT16 ok_blob_count = 0; //non TESS rej blobs in word
375  float *ratings; //array of confidences
376  float threshold; //rejection threshold
377  float bestgap; //biggest gap
378  float gapstart; //bottom of gap
379  //super iterator
380  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
381  BLOB_CHOICE_IT choice_it; //real iterator
382 
383  blob_count = blob_choices->length ();
384  ratings = (float *) alloc_mem (blob_count * sizeof (float));
385  for (list_it.mark_cycle_pt (), index = 0;
386  !list_it.cycled_list (); list_it.forward (), index++) {
387  choice_it.set_to_list (list_it.data ());
388  if (choice_it.length () > 0) {
389  ratings[ok_blob_count] = choice_it.data ()->certainty ();
390  //get in an array
391  // tprintf("Rating[%d]=%c %g %g\n",
392  // index,choice_it.data()->char_class(),
393  // choice_it.data()->rating(),choice_it.data()->certainty());
394  ok_blob_count++;
395  }
396  }
397  ASSERT_HOST (index == blob_count);
398  qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
399  //sort them
400  bestgap = 0;
401  gapstart = ratings[0] - 1; //all reject if none better
402  if (ok_blob_count >= 3) {
403  for (index = 0; index < ok_blob_count - 1; index++) {
404  if (ratings[index + 1] - ratings[index] > bestgap) {
405  bestgap = ratings[index + 1] - ratings[index];
406  //find biggest
407  gapstart = ratings[index];
408  }
409  }
410  }
411  threshold = gapstart + bestgap / 2;
412  // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
413  // ratings[0],ratings[index],bestgap,threshold);
414 
415  free_mem(ratings);
416  return threshold;
417 }
int sort_floats(const void *arg1, const void *arg2)
Definition: helpers.h:46
void free_mem(void *oldchunk)
Definition: memry.cpp:56
void * alloc_mem(inT32 count)
Definition: memry.cpp:48
short inT16
Definition: host.h:100
#define ASSERT_HOST(x)
Definition: errcode.h:84
void reject_blanks ( WERD_RES * word)

Definition at line 290 of file reject.cpp.

290  {
291  inT16 i;
292  inT16 offset;
293 
294  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
295  offset += word->best_choice->unichar_lengths()[i], i += 1) {
296  if (word->best_choice->unichar_string()[offset] == ' ')
297  //rej unrecognised blobs
298  word->reject_map[i].setrej_tess_failure ();
299  }
300 }
const STRING & unichar_string() const
Definition: ratngs.h:395
REJMAP reject_map
Definition: pageres.h:408
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
WERD_CHOICE * best_choice
Definition: pageres.h:359
void reject_poor_matches ( WERD_RES * word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 319 of file reject.cpp.

321  {
322  float threshold;
323  inT16 i = 0;
324  inT16 offset = 0;
325  //super iterator
326  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
327  BLOB_CHOICE_IT choice_it; //real iterator
328 
329  #ifndef SECURE_NAMES
330  if (strlen(word->best_choice->unichar_lengths().string()) !=
331  list_it.length()) {
332  tprintf
333  ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
334  word->best_choice->unichar_string().string(),
335  strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
336  word->box_word->length());
337  }
338  #endif
339  ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
340  list_it.length ());
341  ASSERT_HOST(word->box_word->length() == list_it.length());
342  threshold = compute_reject_threshold (blob_choices);
343 
344  for (list_it.mark_cycle_pt ();
345  !list_it.cycled_list (); list_it.forward (), i++,
346  offset += word->best_choice->unichar_lengths()[i]) {
347  /* NB - only compares the threshold against the TOP choice char in the
348  choices list for a blob !! - the selected one may be below the threshold
349  */
350  choice_it.set_to_list (list_it.data ());
351  if ((word->best_choice->unichar_string()[offset] == ' ') ||
352  (choice_it.length () == 0))
353  //rej unrecognised blobs
354  word->reject_map[i].setrej_tess_failure ();
355  else if (choice_it.data ()->certainty () < threshold)
356  //rej poor score blob
357  word->reject_map[i].setrej_poor_match ();
358  }
359 }
const STRING & unichar_string() const
Definition: ratngs.h:395
const int length() const
Definition: boxword.h:99
float compute_reject_threshold(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: reject.cpp:370
REJMAP reject_map
Definition: pageres.h:408
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
tesseract::BoxWord * box_word
Definition: pageres.h:387
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD_CHOICE * best_choice
Definition: pageres.h:359