55 #define MIN_FONT_ROW_COUNT 8
56 #define MAX_XHEIGHT_DIFF 3
73 TBOX &selection_box) {
79 pseudo_block, pseudo_row);
102 block, row, word_res);
106 (
"\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
129 const TBOX& target_word_box,
130 const char* word_config,
132 if (word_config !=
NULL) {
134 if (backup_config_file_ ==
NULL) {
136 FILE* config_fp = fopen(backup_config_file_,
"wb");
144 if (backup_config_file_ !=
NULL) {
148 backup_config_file_ =
NULL;
151 }
else if (pass > 1 && !word_box.
major_overlap(target_word_box)) {
180 const TBOX* target_word_box,
181 const char* word_config,
200 if (dopasses==0 || dopasses==1) {
214 for (
int i = 0; i < sub_langs_.size(); ++i) {
220 if (monitor !=
NULL) {
222 while (page_res_it.
word() !=
NULL) {
240 most_recently_used_ =
this;
241 while (page_res_it.
word() !=
NULL) {
244 if (monitor !=
NULL) {
252 if (target_word_box &&
254 *target_word_box, word_config, 1)) {
308 if (dopasses == 1)
return true;
313 most_recently_used_ =
this;
317 if (monitor !=
NULL) {
328 if (target_word_box &&
330 *target_word_box, word_config, 2)) {
392 while (page_res_it.
word() !=
NULL) {
413 if (monitor !=
NULL) {
430 if (!word_it.
word())
break;
437 tprintf(
"Skipping because one of the words is W_REP_CHAR\n");
444 tprintf(
"Alt choices not set up for word choice: %s\n",
451 tprintf(
"Alt choices not set up for word choice: %s\n",
480 tprintf(
"Top choice \"%s %s\" verified by bigram model.\n",
486 tprintf(
"Examining alt choices for \"%s %s\".\n",
497 float best_rating = 0.0;
521 if (overrides_word1.
size() == 1 ||
524 best_idx = overrides_word1.
size() - 1;
529 if (overrides_word1.
size() >= 1) {
532 *overrides_word1[best_idx]) &&
534 *overrides_word2[best_idx])) {
536 tprintf(
"Top choice \"%s %s\" verified (sans case) by bigram "
537 "model.\n", orig_w1_str.
string(), orig_w2_str.
string());
541 STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
542 STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
543 if (new_w1_str != orig_w1_str) {
545 *overrides_word1_state[best_idx]);
547 if (new_w2_str != orig_w2_str) {
549 *overrides_word2_state[best_idx]);
552 STRING choices_description;
553 int num_bigram_choices
554 = overrides_word1.
size() * overrides_word2.
size();
555 if (num_bigram_choices == 1) {
556 choices_description =
"This was the unique bigram choice.";
560 const int kMaxChoicesToPrint = 20;
561 for (
int i = 0; i < overrides_word1.
size() &&
562 i < kMaxChoicesToPrint; i++) {
563 if (i > 0) { bigrams_list +=
", "; }
567 if (i == kMaxChoicesToPrint) {
568 bigrams_list +=
" ...";
571 choices_description =
"There were many choices: {";
572 choices_description += bigrams_list;
573 choices_description +=
"}";
575 choices_description.
add_str_int(
"There were ", num_bigram_choices);
576 choices_description +=
" compatible bigrams.";
579 tprintf(
"Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
582 choices_description.
string());
590 const TBOX* target_word_box,
591 const char* word_config) {
600 if (monitor !=
NULL) {
613 if (target_word_box &&
615 *target_word_box, word_config, 4)) {
629 inT16 all_char_quality;
630 inT16 accepted_all_char_quality;
632 &all_char_quality, &accepted_all_char_quality);
635 if ((permuter_type == SYSTEM_DAWG_PERM) ||
636 (permuter_type == FREQ_DAWG_PERM) ||
637 (permuter_type == USER_DAWG_PERM)) {
643 (blob_quality == 0) && (outline_errs >= chars_in_word))
651 (
"QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
652 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
666 BOOL8 good_quality_doc =
705 STRING debug =
"Choice is incorrect after recognition";
722 static_cast<IncorrectResultReason>(bl)),
759 tprintf(
"Retrying word using lang %s, oem %d\n",
769 (this->*recognizer)(block, row, &lang_word);
770 bool new_is_better = NewWordBetter(*word, lang_word);
773 tprintf(
"New result %s better:%s\n",
774 new_is_better ?
"IS" :
"NOT");
776 tprintf(
"New result %s better:%s, r=%g, c=%g\n",
777 new_is_better ?
"IS" :
"NOT",
786 return new_is_better;
800 tprintf(
"Processing word with lang %s at:",
804 const char* result_type =
"Initial";
806 if (initially_done) {
811 result_type =
"Already done";
813 (most_recently_used_->*recognizer)(block, row, word);
815 result_type =
"Accepted";
817 tprintf(
"%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
826 Tesseract* previous_used = most_recently_used_;
827 if (most_recently_used_ !=
this) {
832 most_recently_used_ =
this;
838 for (
int i = 0; i < sub_langs_.size(); ++i) {
839 if (sub_langs_[i] != previous_used) {
841 tprintf(
"Retrying with sub-Tesseract[%d] lang: %s\n",
845 most_recently_used_ = sub_langs_[i];
867 BLOB_CHOICE_LIST_CLIST *blob_choices =
new BLOB_CHOICE_LIST_CLIST();
917 if (adapt_ok || word->
reject_map[index].accepted())
957 tprintf(
"New XHT Match:%s = %s ",
968 new_x_ht > 0.1 ?
"STILL DOUBT" :
"OK",
969 accept_new_word ?
"ACCEPTED" :
"");
977 bool accept_new_x_ht =
false;
979 if (original_misfits == 0)
982 if (new_x_ht > 0.0
f) {
994 tprintf(
"Old misfits=%d with x-height %f, new=%d with x-height %f\n",
996 new_misfits, new_x_ht);
997 tprintf(
"Old rating= %f, certainty=%f, new=%f, %f\n",
1003 accept_new_x_ht = new_misfits < original_misfits &&
1012 if (accept_new_x_ht) {
1032 bool done_this_pass =
false;
1040 done_this_pass =
TRUE;
1045 bool accept_new_xht =
false;
1051 done_this_pass =
true;
1055 double small_cap_delta = (block->
x_height() - small_cap_xheight) / 2.0;
1057 small_cap_xheight - small_cap_delta <= word->
x_height &&
1058 word->
x_height <= small_cap_xheight + small_cap_delta) {
1068 if (num_upper > 0 && num_lower == 0)
1075 #ifndef GRAPHICS_DISABLED
1101 BLOB_CHOICE_LIST_CLIST *blob_choices =
new BLOB_CHOICE_LIST_CLIST();
1117 tprintf(
"POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1118 " #Blobs=%d; #Choices=%d\n",
1141 BLOB_CHOICE_LIST* bc_list) {
1143 BLOB_CHOICE_IT choice_it(bc_list);
1144 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1145 choice_it.forward()) {
1161 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
1162 BLOB_CHOICE* choice = FindMatchingChoice(char_id, bc_it.data());
1163 if (choice !=
NULL) {
1165 best_choice = choice;
1174 static void CorrectRepcharChoices(
BLOB_CHOICE* blob_choice,
1178 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
1181 if (choice ==
NULL) {
1182 BLOB_CHOICE_IT choice_it(bc_it.data());
1183 choice_it.add_before_stay_put(
new BLOB_CHOICE(*blob_choice));
1187 for (
int i = 0; i < word->
length(); ++i) {
1207 for (
int i = 0; i < word.
length(); ++i) {
1214 int max_count = rep_ch.MaxCount(&maxch_id);
1216 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1217 if (best_choice ==
NULL) {
1218 tprintf(
"Failed to find a choice for %s, occurring %d times\n",
1229 C_BLOB* prev_blob = blob_it.data();
1230 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1231 C_BLOB* blob = blob_it.data();
1233 gap -= prev_blob->bounding_box().right();
1243 CorrectRepcharChoices(best_choice, word_res);
1260 for (; !blob_it.empty(); blob_it.forward()) {
1261 bool first_blob = blob_it.at_first();
1262 bool last_blob = blob_it.at_last();
1285 const UNICHARSET& char_set,
const char *s,
const char *lengths) {
1288 int leading_punct_count;
1289 int upper_count = 0;
1290 int hyphen_pos = -1;
1293 if (strlen (lengths) > 20)
1299 offset += lengths[i++];
1300 leading_punct_count = i;
1303 while (s[offset] !=
'\0' && char_set.
get_isupper(s + offset, lengths[i])) {
1304 offset += lengths[i++];
1307 if (upper_count > 1) {
1311 while (s[offset] !=
'\0' && char_set.
get_islower(s + offset, lengths[i])) {
1312 offset += lengths[i++];
1320 if (lengths[i] == 1 && s[offset] ==
'-') {
1322 offset += lengths[i++];
1323 if (s[offset] !=
'\0') {
1324 while ((s[offset] !=
'\0') &&
1326 offset += lengths[i++];
1328 if (i < hyphen_pos + 3)
1333 if (lengths[i] == 1 && (s[offset] ==
'\'') &&
1334 lengths[i + 1] == 1 && (s[offset + lengths[i]] ==
's')) {
1335 offset += lengths[i++];
1336 offset += lengths[i++];
1339 if (upper_count > 0)
1346 if (lengths[i] == 1 && s[offset] !=
'\0' &&
1348 offset += lengths[i++];
1349 if (lengths[i] == 1 && s[offset] !=
'\0' && i > 0 &&
1350 s[offset - lengths[i - 1]] != s[offset] &&
1352 offset += lengths[i++];
1354 if (s[offset] !=
'\0')
1363 if (s[0] !=
'\0' && char_set.
get_isupper(s, lengths[0])) {
1365 while (s[offset] !=
'\0' &&
1367 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1368 offset += lengths[i++];
1369 offset += lengths[i++];
1372 else if (s[0] !=
'\0' && char_set.
get_islower(s, lengths[0])) {
1374 while (s[offset] !=
'\0' &&
1376 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1377 offset += lengths[i++];
1378 offset += lengths[i++];
1381 if (s[offset] !=
'\0')
1392 #ifndef SECURE_NAMES
1407 tprintf (
"classify_word_pass1 start\n");
1411 tprintf (
"make_reject_map: initial map");
1414 tprintf (
"make_reject_map: after NN");
1417 tprintf (
"classify_word_pass2 - START");
1420 tprintf (
"classify_word_pass2 - Pre Xht");
1423 tprintf (
"classify_word_pass2 - END");
1424 show_map_detail =
TRUE;
1436 tprintf (
"After Poor quality rejection");
1439 tprintf (
"unrej_good_quality_words - START");
1442 tprintf (
"unrej_good_quality_words - END");
1445 tprintf (
"Write results pass");
1446 show_map_detail =
TRUE;
1453 if (show_map_detail) {
1461 tprintf (
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
1474 static void find_modal_font(
1487 fonts->
add (font, -*font_count);
1501 BLOB_CHOICE_LIST_CLIST *blob_choices) {
1502 if (blob_choices ==
NULL)
return;
1509 BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
1510 BLOB_CHOICE_IT choice_it;
1513 if (fontinfo_size == 0 || fontset_size == 0)
return;
1514 STATS fonts(0, fontinfo_size);
1522 for (char_it.mark_cycle_pt(), index = 0;
1523 !char_it.cycled_list(); ++index, char_it.forward()) {
1525 choice_it.set_to_list(char_it.data());
1527 tprintf(
"Examining fonts in %s\n",
1530 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1531 choice_it.forward()) {
1532 UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
1533 if (blob_ch_id == word_ch_id) {
1535 tprintf(
"%s font %s (%d) font2 %s (%d)\n",
1537 choice_it.data()->fontinfo_id() < 0 ?
"unknown" :
1539 choice_it.data()->fontinfo_id(),
1540 choice_it.data()->fontinfo_id2() < 0 ?
"unknown" :
1542 choice_it.data()->fontinfo_id2());
1545 if (choice_it.data()->fontinfo_id() >= 0) {
1546 fonts.
add(choice_it.data()->fontinfo_id(), 2);
1548 if (choice_it.data()->fontinfo_id2() >= 0) {
1549 fonts.
add(choice_it.data()->fontinfo_id2(), 1);
1555 inT16 font_id1, font_id2;
1568 tprintf(
"Word modal font=%s, score=%d, 2nd choice %s/%d\n",
1573 tprintf(
"Word modal font=%s, score=%d. No 2nd choice\n",
1593 STATS doc_fonts(0, font_table_size_);
1598 word = page_res_it.
word();
1607 inT8 doc_font_count;
1608 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1609 if (doc_font_count == 0)
1615 word = page_res_it.
word();
1630 word = page_res_it.
word();
1635 if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
void classify_word_and_language(WordRecognizer recognizer, BLOCK *block, ROW *row, WERD_RES *word)
bool ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
bool SetupForTessRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, bool numeric_mode, bool use_body_size, ROW *row, BLOCK *block)
int tessedit_pageseg_mode
void full_print(FILE *fp)
const STRING & unichar_string() const
inT16 doc_good_char_quality
const char * IncorrectReason() const
BOOL8 recog_interactive(BLOCK *block, ROW *row, WERD_RES *word_res)
C_BLOB_LIST * cblob_list()
bool script_has_xheight() const
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
bool top_bottom_useful() const
const char *const id_to_unichar(UNICHAR_ID id) const
IncorrectResultReason incorrect_result_reason
BLOB_CHOICE_LIST_CLIST * blob_choices()
bool tessedit_display_outwords
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
int tessedit_tess_adaption_mode
const STRING debug_string() const
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
bool tessedit_rejection_debug
bool deadline_exceeded() const
bool tessedit_test_adaption
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
char * chs_trailing_punct1
void ConsumeWordResults(WERD_RES *word)
const FontInfo * fontinfo
bool tessedit_debug_fonts
bool tessedit_tess_adapt_to_rejmap
float ComputeCompatibleXheight(WERD_RES *word_res)
bool contains(const FCOORD pt) const
UnicityTable< FontSet > & get_fontset_table()
const FontInfo * fontinfo2
GenericVector< WERD_CHOICE * > alt_choices
int tessedit_ocr_engine_mode
void LearnWord(const char *filename, const char *rejmap, WERD_RES *word)
tesseract::Tesseract * tesseract
double quality_outline_pc
WERD_RES * restart_page()
void font_recognition_pass(PAGE_RES *page_res)
int CountMisfitTops(WERD_RES *word_res)
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug)
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
void make_reject_map(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices, ROW *row, inT16 pass)
bool get_islower(UNICHAR_ID unichar_id) const
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
void(Tesseract::* WordRecognizer)(BLOCK *block, ROW *row, WERD_RES *word)
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
int tessedit_bigram_debug
BOOL8 contains(const char c) const
EXTERN ScrollView * fx_win
bool tessedit_debug_quality_metrics
#define LOC_WRITE_RESULTS
UNICHAR_ID unichar_id() const
bool get_isupper(UNICHAR_ID unichar_id) const
bool tessedit_reject_bad_qual_wds
bool tessedit_training_tess
bool right_to_left() const
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
inT32 x_height() const
return xheight
STRING debug_str(UNICHAR_ID id) const
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
void add(inT32 value, inT32 count)
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
void set_global_loc_code(int loc_code)
void CopyTruth(const BlamerBundle &other)
char * chs_trailing_punct2
BOOL8 check_debug_pt(WERD_RES *word, int location)
int quality_min_initial_alphas_reqd
GenericVector< GenericVector< int > > alt_states
const double kRepcharGapThreshold
void run_cube_combiner(PAGE_RES *page_res)
void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices)
BOOL8 flag(WERD_FLAGS mask) const
GenericVector< inT8 > best_choice_fontinfo_ids
void add_str_int(const char *str, int number)
int AdaptableWord(TWERD *Word, const WERD_CHOICE &BestChoiceWord, const WERD_CHOICE &RawChoiceWord)
void CloneChoppedToRebuild()
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
bool tessedit_enable_bigram_correction
BLOCK_RES * block() const
int tessedit_test_adaption_mode
BOOL8 tess_acceptable_word(WERD_CHOICE *word_choice, WERD_CHOICE *raw_choice)
inT32 pile_count(inT32 value) const
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
const UNICHARSET * uch_set
void print_word_alternates_list(WERD_CHOICE *word, GenericVector< WERD_CHOICE * > *alternates)
void rej_word_bad_quality()
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
BOOL8 word_dumper(BLOCK *block, ROW *row, WERD_RES *word_res)
void classify_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
void rej_word_tess_failure()
bool classify_bln_numeric_mode
void bigram_correction_pass(PAGE_RES *page_res)
void set_global_subloc_code(int loc_code)
GenericVector< STRING > truth_text
UnicityTable< FontInfo > & get_fontinfo_table()
void ExplodeRepeatedWord(BLOB_CHOICE *best_choice, PAGE_RES_IT *page_res_it)
const char * string() const
UnicityTable< FontInfo > fontinfo_table_
void WithoutFootnoteSpan(int *start, int *end) const
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
inT16 word_outline_errs(WERD_RES *word)
bool tessedit_word_for_word
WERD * make_pseudo_word(PAGE_RES *page_res, TBOX &selection_box, BLOCK *&pseudo_block, ROW *&pseudo_row)
DLLSYM void tprintf(const char *format,...)
static const double kXHeightCapRatio
bool wordrec_debug_blamer
bool RetryWithLanguage(WERD_RES *word, BLOCK *block, ROW *row, WordRecognizer recognizer)
void blamer_pass(PAGE_RES *page_res)
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
TBOX bounding_box() const
bool tessedit_minimal_rejection
static const char * IncorrectReasonName(IncorrectResultReason irr)
inT16 word_blob_quality(WERD_RES *word, ROW *row)
void BestChoiceToCorrectText()
bool tessedit_dump_choices
GenericVector< int > blame_reasons
void plot(ScrollView *window)
const char * permuter_name() const
bool tessedit_minimal_rej_pass1
void match_word_pass2(WERD_RES *word, ROW *row, BLOCK *block)
bool tessedit_fix_hyphens
tesseract::BoxWord * box_word
void classify_word_pass2(BLOCK *block, ROW *row, WERD_RES *word)
void set_unichar_id(UNICHAR_ID unichar_id, int index)
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
bool major_overlap(const TBOX &box) const
bool AdaptiveClassifierIsFull()
void fix_rep_char(PAGE_RES_IT *page_res_it)
GenericVector< STRING > misadaption_log
void Add(T value, int count)
void SetupFake(const UNICHARSET &uch)
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices)
void SetScriptPositions()
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
void set_word_fonts(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
bool textord_use_cjk_fp_model
bool tessedit_enable_doc_dict
void tess_add_doc_word(WERD_CHOICE *word_choice)
void tess_segment_pass2(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
void ReplaceBestChoice(const WERD_CHOICE &choice, const GenericVector< int > &segmentation_state)
void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices)
void InitForRetryRecognition(const WERD_RES &source)
bool tessedit_fix_fuzzy_spaces
WERD_CHOICE shallow_copy(int start, int end) const
void tess_segment_pass1(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
const UNICHAR_ID unichar_id(int index) const
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
void ResetAdaptiveClassifierInternal()
BlamerBundle * blamer_bundle
void initialise(inT16 length)
const char *const kBackUpConfigFile
WERD_CHOICE * best_choice
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)