21 #pragma warning(disable:4244) // Conversion warnings
22 #pragma warning(disable:4305) // int/float warnings
64 void Tesseract::set_done(
70 if (tessedit_ok_mode == 0) {
77 else if (tessedit_ok_mode == 1) {
81 if (word->
done && (pass == 1) && one_ell_conflict (word,
FALSE))
87 else if (tessedit_ok_mode == 2) {
91 if (word->
done && (pass == 1) && one_ell_conflict (word,
FALSE))
101 if (tessedit_rejection_debug)
102 tprintf (
"\nVETO Tess accepting poor word \"%s\"\n",
111 else if (tessedit_ok_mode == 3) {
115 if (word->
done && (pass == 1) && one_ell_conflict (word,
FALSE))
124 if (tessedit_rejection_debug)
125 tprintf (
"\nVETO Tess accepting poor word \"%s\"\n",
134 else if (tessedit_ok_mode == 4) {
138 if (word->
done && (pass == 1) && one_ell_conflict (word,
FALSE))
147 (test_ambig_word (word)))) {
149 if (tessedit_rejection_debug)
150 tprintf (
"\nVETO Tess accepting poor word \"%s\"\n",
159 else if (tessedit_ok_mode == 5) {
163 if (word->
done && (pass == 1) && one_ell_conflict (word,
FALSE))
171 (test_ambig_word (word)))) {
173 if (tessedit_rejection_debug)
174 tprintf (
"\nVETO Tess accepting poor word \"%s\"\n",
182 tprintf (
"BAD tessedit_ok_mode\n");
195 void Tesseract::make_reject_map(
197 BLOB_CHOICE_LIST_CLIST *blob_choices,
205 check_debug_pt(word, -1);
206 set_done(word, pass);
212 if (tessedit_reject_mode == 0) {
215 }
else if (tessedit_reject_mode == 5) {
224 one_ell_conflict(word,
TRUE);
234 if (rej_use_tess_blanks &&
239 if (rej_use_good_perm) {
240 if ((best_choice->
permuter() == SYSTEM_DAWG_PERM ||
241 best_choice->
permuter() == FREQ_DAWG_PERM ||
242 best_choice->
permuter() == USER_DAWG_PERM) &&
243 (!rej_use_sensible_wd ||
244 acceptable_word_string(*word->
uch_set,
249 }
else if (best_choice->
permuter() == NUMBER_PERM) {
250 if (rej_alphas_in_number_perm) {
251 for (i = 0, offset = 0;
269 tprintf(
"BAD tessedit_reject_mode\n");
273 if (tessedit_image_border > -1)
274 reject_edge_blobs(word);
276 check_debug_pt (word, 10);
277 if (tessedit_rejection_debug) {
279 tprintf(
"Certainty: %f Rating: %f\n",
285 check_debug_pt(word, 20);
321 BLOB_CHOICE_LIST_CLIST *blob_choices) {
326 BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
327 BLOB_CHOICE_IT choice_it;
333 (
"ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
344 for (list_it.mark_cycle_pt ();
345 !list_it.cycled_list (); list_it.forward (), i++,
350 choice_it.set_to_list (list_it.data ());
352 (choice_it.length () == 0))
355 else if (choice_it.data ()->certainty () < threshold)
371 BLOB_CHOICE_LIST_CLIST *blob_choices) {
374 inT16 ok_blob_count = 0;
380 BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
381 BLOB_CHOICE_IT choice_it;
383 blob_count = blob_choices->length ();
384 ratings = (
float *)
alloc_mem (blob_count *
sizeof (
float));
385 for (list_it.mark_cycle_pt (), index = 0;
386 !list_it.cycled_list (); list_it.forward (), index++) {
387 choice_it.set_to_list (list_it.data ());
388 if (choice_it.length () > 0) {
389 ratings[ok_blob_count] = choice_it.data ()->certainty ();
398 qsort (ratings, ok_blob_count,
sizeof (
float),
sort_floats);
401 gapstart = ratings[0] - 1;
402 if (ok_blob_count >= 3) {
403 for (index = 0; index < ok_blob_count - 1; index++) {
404 if (ratings[index + 1] - ratings[index] > bestgap) {
405 bestgap = ratings[index + 1] - ratings[index];
407 gapstart = ratings[index];
411 threshold = gapstart + bestgap / 2;
437 for (
int blobindex = 0; blobindex < blobcount; blobindex++) {
443 word->
reject_map[blobindex].setrej_edge_char();
460 inT16 first_alphanum_index_;
461 inT16 first_alphanum_offset_;
464 BOOL8 non_conflict_set_char;
468 BOOL8 dict_perm_type;
474 word_len = strlen (lengths);
487 for (i = 0, offset = 0, non_conflict_set_char =
FALSE;
488 (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
489 non_conflict_set_char =
493 if (!non_conflict_set_char) {
511 dict_word_ok = (dict_word_type > 0) &&
516 (dict_perm_type && dict_word_ok)) {
519 if (lengths[first_alphanum_index_] == 1 &&
520 word[first_alphanum_offset_] ==
'I') {
526 setrej_1Il_conflict();
535 if (lengths[first_alphanum_index_] == 1 &&
536 word[first_alphanum_offset_] ==
'l') {
542 setrej_1Il_conflict();
566 if (lengths[first_alphanum_index_] == 1 &&
567 word[first_alphanum_offset_] ==
'l') {
574 else if (lengths[first_alphanum_index_] == 1 &&
575 word[first_alphanum_offset_] ==
'I') {
594 for (i = 0, offset = 0; word[offset] !=
'\0';
596 if ((!allow_1s || (word[offset] !=
'1')) &&
599 word_res->
reject_map[i].setrej_1Il_conflict ();
616 setrej_1Il_conflict ();
634 const char *word_lengths) {
638 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
647 const char *word_lengths) {
651 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
660 const char *word_lengths) {
665 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
674 const char *word_lengths) {
678 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
680 (word_lengths[i] != 1 || word[offset] !=
'1'))
713 for (i = 0, offset = 0; i < word_len;
728 for (i = 0, offset = 0; i < word_len;
763 inT16 accepted_char_quality;
780 (char_quality == accepted_char_quality))
789 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
799 int prev_right = -9999;
809 bool modified =
false;
810 for (i = 0; i < best_choice->
length() && blob !=
NULL; ++i,
812 out_box = blob->bounding_box();
813 if (blob->next ==
NULL)
816 next_left = blob->next->bounding_box().
left();
819 (out_box.
left() > prev_right) && (out_box.
right() < next_left)) {
820 aspect_ratio = out_box.
width() / (float) out_box.
height();
829 word_res->
reject_map[i].setrej_hyphen_accept();
836 else if (best_choice->
unichar_id(i) == unichar_dash) {
839 word_res->
reject_map[i].setrej_hyphen_accept();
848 prev_right = out_box.
right();
865 for (i = 0; i < best_choice->
length() && blob !=
NULL; ++i,
869 out_box = blob->bounding_box();
877 if (unichar_0 == INVALID_UNICHAR_ID ||
879 unichar_O == INVALID_UNICHAR_ID ||
883 bool modified =
false;
884 for (i = 1; i < best_choice->
length(); ++i) {
885 if (best_choice->
unichar_id(i) == unichar_0 ||
888 if ((i+1) < best_choice->
length() &&
896 (i+1) < best_choice->
length() &&
899 (i+2) < best_choice->
length() &&
909 (((i+1) < best_choice->
length() &&
913 (i == best_choice->
length() - 1))) {
919 (i+1) < best_choice->
length() &&
926 (i+2) < best_choice->
length() &&
939 (i+2) < best_choice->
length() &&
950 (i+1) < best_choice->
length() &&
960 if (best_choice->
unichar_id(i-2) == unichar_O) {
964 while (i < best_choice->length() &&
978 return ch_set.
get_isupper(unichar_id) && !ch_set.
eq(unichar_id,
"O");
982 return ch_set.
get_isdigit(unichar_id) && !ch_set.
eq(unichar_id,
"0");
const STRING & unichar_string() const
void reject_edge_blobs(WERD_RES *word)
bool get_isalpha(UNICHAR_ID unichar_id) const
float compute_reject_threshold(BLOB_CHOICE_LIST_CLIST *blob_choices)
void flip_hyphens(WERD_RES *word)
bool contains_unichar_id(UNICHAR_ID unichar_id) const
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
char * conflict_set_I_l_1
const int kBlnBaselineOffset
void reject_mostly_rejects(WERD_RES *word)
void rej_word_not_tess_accepted()
void reject_I_1_L(WERD_RES *word)
inT16 first_alphanum_index(const char *word, const char *word_lengths)
tesseract::Tesseract * tesseract
void flip_0O(WERD_RES *word)
void reject_poor_matches(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
int sort_floats(const void *arg1, const void *arg2)
void free_mem(void *oldchunk)
bool rej_1Il_use_dict_word
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
BOOL8 contains(const char c) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
void flip_0O(WERD_RES *word)
void * alloc_mem(inT32 count)
void rej_word_bad_permuter()
double tessedit_lower_flip_hyphen
void flip_hyphens(WERD_RES *word)
inT16 safe_dict_word(const WERD_RES *werd_res)
inT16 count_alphanums(const WERD_CHOICE &word)
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs)
const UNICHARSET * uch_set
bool rej_1Il_trust_permuter_type
double tessedit_upper_flip_hyphen
const char * string() const
void dont_allow_1Il(WERD_RES *word)
int dict_word(const WERD_CHOICE &word)
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
DLLSYM void tprintf(const char *format,...)
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
inT16 alpha_count(const char *word, const char *word_lengths)
#define CLISTIZE(CLASSNAME)
const STRING & unichar_lengths() const
void rej_word_small_xht()
double rej_whole_of_mostly_reject_word_fract
tesseract::BoxWord * box_word
const TBOX & BlobBox(int index) const
void set_unichar_id(UNICHAR_ID unichar_id, int index)
bool get_enabled(UNICHAR_ID unichar_id) const
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
BOOL8 test_ambig_word(WERD_RES *word)
char * ok_repeated_ch_non_alphanum_wds
int tessedit_image_border
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
void rej_word_contains_blanks()
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
void rej_word_mostly_rej()
const UNICHAR_ID unichar_id(int index) const
void reject_blanks(WERD_RES *word)
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
void initialise(inT16 length)
WERD_CHOICE * best_choice