25 #pragma warning(disable:4244) // Conversion warnings
35 probability_in_context_(&
tesseract::
Dict::def_probability_in_context),
36 image_ptr_(image_ptr),
38 "A list of user-provided words.",
39 getImage()->getCCUtil()->params()),
41 "A list of user-provided patterns.",
42 getImage()->getCCUtil()->params()),
44 getImage()->getCCUtil()->params()),
46 getImage()->getCCUtil()->params()),
48 getImage()->getCCUtil()->params()),
50 " patterns.", getImage()->getCCUtil()->params()),
52 " patterns.", getImage()->getCCUtil()->params()),
54 " (e.g. for non-space delimited languages)",
55 getImage()->getCCUtil()->params()),
57 "bigrams.", getImage()->getCCUtil()->params()),
59 "Score multiplier for word matches which have good case and"
60 "are frequent in the given language (lower is better).",
61 getImage()->getCCUtil()->params()),
63 "Score multiplier for word matches that have good case "
64 "(lower is better).", getImage()->getCCUtil()->params()),
66 "Default score multiplier for word matches, which may have "
67 "case issues (lower is better).",
68 getImage()->getCCUtil()->params()),
70 "Multipler to for the best choice from the ngram model.",
71 getImage()->getCCUtil()->params()),
73 "Score multiplier for glyph fragment segmentations which "
74 "do not match a dictionary word (lower is better).",
75 getImage()->getCCUtil()->params()),
77 "Score multiplier for poorly cased strings that are not in"
78 " the dictionary and generally look like garbage (lower is"
79 " better).", getImage()->getCCUtil()->params()),
81 "Output file for ambiguities found in the dictionary",
82 getImage()->getCCUtil()->params()),
83 INT_MEMBER(dawg_debug_level, 0,
"Set to 1 for general debug info"
84 ", to 2 for more details, to 3 to see all the debug messages",
85 getImage()->getCCUtil()->params()),
86 INT_MEMBER(hyphen_debug_level, 0,
"Debug level for hyphenated words.",
87 getImage()->getCCUtil()->params()),
88 INT_MEMBER(max_viterbi_list_size, 10,
"Maximum size of viterbi list.",
89 getImage()->getCCUtil()->params()),
91 "Use only the first UTF8 step of the given string"
92 " when computing log probabilities.",
93 getImage()->getCCUtil()->params()),
94 double_MEMBER(certainty_scale, 20.0,
"Certainty scaling factor",
95 getImage()->getCCUtil()->params()),
97 "Certainty threshold for non-dict words",
98 getImage()->getCCUtil()->params()),
100 "Reject certainty offset",
101 getImage()->getCCUtil()->params()),
103 "Size of dict word to be treated as non-dict word",
104 getImage()->getCCUtil()->params()),
105 double_MEMBER(stopper_certainty_per_char, -0.50,
"Certainty to add"
106 " for each dict char above small word size.",
107 getImage()->getCCUtil()->params()),
109 "Max certaintly variation allowed in a word (in sigma)",
110 getImage()->getCCUtil()->params()),
111 INT_MEMBER(stopper_debug_level, 0,
"Stopper debug level",
112 getImage()->getCCUtil()->params()),
114 "Make AcceptableChoice() always return false. Useful"
115 " when there is a need to explore all segmentations",
116 getImage()->getCCUtil()->params()),
118 "Gain factor for ambiguity threshold.",
119 getImage()->getCCUtil()->params()),
121 "Certainty offset for ambiguity threshold.",
122 getImage()->getCCUtil()->params()),
123 BOOL_MEMBER(save_raw_choices, false,
"Save all explored raw choices",
124 getImage()->getCCUtil()->params()),
125 INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
126 "Max words to keep in list",
127 getImage()->getCCUtil()->params()),
128 STRING_MEMBER(word_to_debug,
"",
"Word for which stopper debug"
129 " information should be printed to stdout",
130 getImage()->getCCUtil()->params()),
132 "Lengths of unichars in word_to_debug",
133 getImage()->getCCUtil()->params()),
135 getImage()->getCCUtil()->params()),
137 getImage()->getCCUtil()->params()),
139 getImage()->getCCUtil()->params()),
140 double_MEMBER(bestrate_pruning_factor, 2.0,
"Multiplying factor of"
141 " current best rate to prune other hypotheses",
142 getImage()->getCCUtil()->params()),
144 "Turn on word script consistency permuter",
145 getImage()->getCCUtil()->params()),
147 "incorporate segmentation cost in word rating?",
148 getImage()->getCCUtil()->params()),
150 "Don't use any alphabetic-specific tricks."
151 "Set to true in the traineddata config file for"
152 " scripts that are cursive or inherently fixed-pitch",
153 getImage()->getCCUtil()->params()),
155 "Score multipler for script consistency within a word. "
156 "Being a 'reward' factor, it should be <= 1. "
157 "Smaller value implies bigger reward.",
158 getImage()->getCCUtil()->params()),
160 "Turn on fixed-length phrasebook search permuter",
161 getImage()->getCCUtil()->params()),
163 "Turn on character type (property) consistency permuter",
164 getImage()->getCCUtil()->params()),
166 "Score multipler for char type consistency within a word. ",
167 getImage()->getCCUtil()->params()),
169 "Score multipler for ngram permuter's best choice"
170 " (only used in the Han script path).",
171 getImage()->getCCUtil()->params()),
172 BOOL_MEMBER(save_doc_words, 0,
"Save Document Words",
173 getImage()->getCCUtil()->params()),
174 BOOL_MEMBER(doc_dict_enable, 1,
"Enable Document Dictionary ",
175 getImage()->getCCUtil()->params()),
177 "Worst certainty for using pending dictionary",
178 getImage()->getCCUtil()->params()),
180 "Worst certainty for words that can be inserted into the"
181 "document dictionary", getImage()->getCCUtil()->params()),
183 "Activate character-level n-gram-based permuter",
184 getImage()->getCCUtil()->params()),
186 " character choices to consider during permutation."
187 " This limit is especially useful when user patterns"
188 " are specified, since overly generic patterns can result in"
189 " dawg search exploring an overly large number of options.",
190 getImage()->getCCUtil()->params()),
192 getImage()->getCCUtil()->params()) {
193 dang_ambigs_table_ =
NULL;
194 replace_ambigs_table_ =
NULL;
195 keep_word_choices_ =
false;
196 reject_offset_ = 0.0;
197 best_raw_choice_ =
NULL;
202 last_word_on_line_ =
false;
203 hyphen_unichar_id_ = INVALID_UNICHAR_ID;
204 document_words_ =
NULL;
205 pending_words_ =
NULL;
209 max_fixed_length_dawgs_wdlen_ = -1;
210 wordseg_rating_adjust_factor_ = -1.0f;
211 output_ambig_words_file_ =
NULL;
215 if (hyphen_word_ !=
NULL)
delete hyphen_word_;
216 if (output_ambig_words_file_ !=
NULL) fclose(output_ambig_words_file_);
238 dawgs_ += punc_dawg_;
261 dawgs_ += freq_dawg_;
268 dawgs_ += unambig_dawg_;
302 dawgs_ += document_words_;
315 &dawgs_, &max_fixed_length_dawgs_wdlen_);
322 for (
int i = 0; i < dawgs_.
length(); ++i) {
323 const Dawg *dawg = dawgs_[i];
325 for (
int j = 0; j < dawgs_.
length(); ++j) {
326 const Dawg *other = dawgs_[j];
327 if (dawg !=
NULL && other !=
NULL &&
329 kDawgSuccessors[dawg->
type()][other->
type()]) *lst += j;
343 document_words_ =
NULL;
344 max_fixed_length_dawgs_wdlen_ = -1;
345 if (pending_words_ !=
NULL) {
346 delete pending_words_;
347 pending_words_ =
NULL;
358 for (
int i = 0; unichar_strings[i] != 0; i++) {
360 if (unichar_id != INVALID_UNICHAR_ID) {
369 for (
int i = 0; i < equivalent_symbols_.
size(); i++) {
370 if (equivalent_symbols_[i].contains(unichar_id)) {
371 return equivalent_symbols_[i][0];
382 bool word_end)
const {
386 tprintf(
"def_letter_is_okay: current unichar=%s word_end=%d"
387 " num active dawgs=%d num constraints=%d\n",
397 unichar_id == INVALID_UNICHAR_ID) {
403 PermuterType curr_perm = NO_PERM;
425 dawg_args, &curr_perm);
441 dawg->
edge_char_of(node, dawg_unichar_id, word_end) : NO_EDGE;
448 if (edge != NO_EDGE) {
450 word_end, dawg->
type())) {
457 "Append current dawg to updated active dawgs: ");
471 if (edge != NO_EDGE) {
474 "Recording constraint: ");
485 if (info.
ref == NO_EDGE) {
493 if (edge == NO_EDGE &&
504 for (
int s = 0; s < slist.
length(); ++s) {
505 int sdawg_index = slist[s];
506 const Dawg *sdawg = dawgs_[sdawg_index];
509 for (
int c = 0; c < constraints.
length(); ++c) {
512 const DawgInfo &cinfo = constraints[c];
517 if (snode == 0) snode = NO_EDGE;
527 if (sedge != NO_EDGE &&
529 dawgs_[sdawg_index]->type())) {
531 tprintf(
"Letter found in the successor dawg %d\n", sdawg_index);
537 "Append successor to updated active dawgs: ");
547 if (dawg_args->
permuter == NO_PERM || curr_perm == NO_PERM ||
548 (curr_perm != PUNC_PERM && dawg_args->
permuter != COMPOUND_PERM)) {
557 PermuterType *curr_perm)
const {
562 unichar_id_patterns.
push_back(unichar_id);
564 &unichar_id_patterns);
565 for (
int i = 0; i < unichar_id_patterns.
size(); ++i) {
568 for (
int k = 0; k < 2; ++k) {
570 dawg->
edge_char_of(node, unichar_id_patterns[i], word_end)
572 if (edge != NO_EDGE) {
578 word_end, dawg->
type())) {
585 "Append current dawg to updated active dawgs: ");
593 PermuterType perm,
int debug_level,
598 dawg_vec_copy.
move(dawg_vec);
600 fread(&num_dawgs,
sizeof(
inT32), 1, file);
602 if (swap) num_dawgs =
reverse32(num_dawgs);
604 int max_word_length = 0;
608 for (i = 0; i < num_dawgs; ++i) {
609 fread(&word_length,
sizeof(
inT32), 1, file);
610 if (swap) word_length =
reverse32(word_length);
613 (*dawg_vec)[word_length] =
615 if (word_length > max_word_length) max_word_length = word_length;
617 *max_wdlen = max_word_length;
621 for (i = 0; i < dawg_vec_copy.
size(); ++i) {
628 int num_dawgs,
int debug_level, FILE *output_file) {
629 fwrite(&num_dawgs,
sizeof(
inT32), 1, output_file);
630 if (debug_level)
tprintf(
"Writing %d split length dawgs\n", num_dawgs);
631 for (
int i = 1; i < dawg_vec.
size(); ++i) {
632 if ((dawg_vec)[i] !=
NULL) {
633 fwrite(&i,
sizeof(
inT32), 1, output_file);
634 dawg_vec[i]->write_squished_dawg(output_file);
635 if (debug_level)
tprintf(
"Wrote Dawg with word length %d\n", i);
645 bool ambigs_mode)
const {
647 if (sought_word_length != kAnyWordLength) {
649 if (sought_word_length <= max_fixed_length_dawgs_wdlen_ &&
650 dawgs_[sought_word_length] !=
NULL) {
651 *active_dawgs +=
DawgInfo(sought_word_length, NO_EDGE);
654 *active_dawgs = hyphen_active_dawgs_;
656 for (i = 0; i < hyphen_active_dawgs_.
size(); ++i) {
658 hyphen_active_dawgs_[i].dawg_index,
659 hyphen_active_dawgs_[i].ref);
663 for (i = 0; i < dawgs_.
length(); ++i) {
664 if (dawgs_[i] !=
NULL && kBeginningDawgsType[(dawgs_[i])->type()] &&
666 *active_dawgs +=
DawgInfo(i, NO_EDGE);
679 *constraints = hyphen_constraints_;
681 for (
int i = 0; i < hyphen_constraints_.
size(); ++i) {
683 hyphen_constraints_[i].dawg_index,
684 hyphen_constraints_[i].ref);
697 if (hyphen_word_)
return;
701 int stringlen = best_choice.
length();
708 if (best_choice.
length() >= kDocDictMaxRepChars) {
709 int num_rep_chars = 1;
711 for (
int i = 1; i < best_choice.
length(); ++i) {
717 if (num_rep_chars == kDocDictMaxRepChars)
return;
740 strcat(filename,
".doc");
741 doc_word_file =
open_file (filename,
"a");
742 fprintf(doc_word_file,
"%s\n",
744 fclose(doc_word_file);
750 float *certainty_array,
753 float additional_adjust,
755 bool is_han = (char_choices !=
NULL &&
762 float adjust_factor = additional_adjust;
763 float new_rating = word->
rating();
765 tprintf(
"%sWord: %s %4.2f ", nonword ?
"Non-" :
"",
768 new_rating += kRatingPad;
770 if (case_is_ok && punc_is_ok) {
772 new_rating *= adjust_factor;
776 new_rating *= adjust_factor;
778 if (!case_is_ok)
tprintf(
", C");
779 if (!punc_is_ok)
tprintf(
", P");
787 new_rating *= adjust_factor;
791 new_rating *= adjust_factor;
796 new_rating *= adjust_factor;
800 new_rating -= kRatingPad;
802 if (debug)
tprintf(
" %4.2f --> %4.2f\n", adjust_factor, new_rating);
803 LogNewChoice(adjust_factor, certainty_array,
false, word,
813 word_ptr = &temp_word;
815 if (word_ptr->
length() == 0)
return NO_PERM;
822 DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
823 &(active_dawgs[1]), &(constraints[1]),
824 0.0, NO_PERM, kAnyWordLength, 0);
825 int last_index = word_ptr->
length() - 1;
829 i == last_index)))
break;
843 delete[] active_dawgs;
844 delete[] constraints;
851 if (bigram_dawg_ ==
NULL)
return false;
855 int w1start, w1end, w2start, w2end;
861 if (w1start >= w1end)
return word1.
length() < 3;
862 if (w2start >= w2end)
return word2.
length() < 3;
866 for (
int i = w1start; i < w1end; i++) {
870 bigram_string +=
" ";
871 for (
int i = w2start; i < w2end; i++) {
880 if (word.
length() == 0)
return NO_PERM;
883 int last_index = word.
length() - 1;
885 for (i = 0; i <= last_index; ++i) {
888 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
892 }
else if ((new_len = new_word.length()) == 0 ||
897 for (i = 0; i < dawgs_.
size(); ++i) {
898 if (dawgs_[i] !=
NULL &&
900 dawgs_[i]->word_in_dawg(new_word))
return true;
911 int *sid =
new int[max_script];
913 for (x = 0; x < max_script; x++) sid[x] = 0;
914 for (x = 0; x < char_choices.
length(); ++x) {
915 BLOB_CHOICE_IT blob_choice_it(char_choices.
get(x));
916 sid[blob_choice_it.data()->script_id()]++;
932 for (x = 1; x < max_script; x++)
933 if (sid[x] >= sid[max_sid]) max_sid = x;
934 if (sid[max_sid] < char_choices.
length() / 2)
void delete_data_pointers()
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
static void ReadFixedLengthDawgs(DawgType type, const STRING &lang, PermuterType perm, int debug_level, FILE *file, DawgVector *dawg_vec, int *max_wdlen)
bool valid_punctuation(const WERD_CHOICE &word)
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
void set_rating(float new_val)
const char *const id_to_unichar(UNICHAR_ID id) const
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &unicharset)
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
double doc_dict_certainty_threshold
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
const STRING debug_string() const
void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
FILE * open_file(const char *filename, const char *mode)
FILE * GetDataFilePtr() const
void punct_stripped(int *start_core, int *end_core) const
void copy_hyphen_info(WERD_CHOICE *word) const
int max_permuter_attempts
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
const char * kApostropheLikeUTF8[]
const UNICHARSET & getUnicharset() const
bool CurrentWordAmbig()
Returns true if there are multiple good choices for the current word.
int get_script_table_size() const
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function that will be modified by various permuters.
TessdataManager tessdata_manager
#define STRING_INIT_MEMBER(name, val, comment, vec)
STRING language_data_path_prefix
bool add_unique(const DawgInfo &new_info, bool debug, const char *debug_msg)
bool permute_fixed_length_dawg
DawgInfoVector * updated_constraints
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
bool get_isupper(UNICHAR_ID unichar_id) const
#define INT_MEMBER(name, val, comment, vec)
bool get_isdigit(UNICHAR_ID unichar_id) const
const CCUtil * getCCUtil() const
double segment_reward_chartype
#define BOOL_INIT_MEMBER(name, val, comment, vec)
void initialize_patterns(UNICHARSET *unicharset)
double segment_reward_script
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const
bool permute_chartype_word
bool SeekToStart(TessdataType tessdata_type)
double segment_penalty_garbage
double segment_penalty_dict_frequent_word
PermuterType permuter() const
void init_active_dawgs(int sought_word_length, DawgInfoVector *active_dawgs, bool ambigs_mode) const
double segment_penalty_dict_nonword
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
static void WriteFixedLengthDawgs(const GenericVector< SquishedDawg * > &dawg_vec, int num_dawgs, int debug_level, FILE *output_file)
const STRING & lang() const
bool ConstraintsOk(const DawgInfoVector &constraints, int word_end, DawgType current_dawg_type) const
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
static const UNICHAR_ID kPatternUnicharID
DLLSYM uinT32 reverse32(uinT32 num)
void adjust_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool nonword, float additional_adjust, bool debug)
Adjusts the rating of the given word.
const char * string() const
double segment_reward_ngram_best_choice
bool read_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
double doc_dict_pending_threshold
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
virtual bool end_of_word(EDGE_REF edge_ref) const =0
char * user_patterns_suffix
DLLSYM void tprintf(const char *format,...)
void LoadEquivalenceList(const char *unichar_strings[])
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
void init_constraints(DawgInfoVector *constraints) const
bool segment_segcost_rating
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
void move(GenericVector< T > *from)
DawgInfoVector * updated_active_dawgs
#define STRING_MEMBER(name, val, comment, vec)
DawgInfoVector * active_dawgs
double segment_penalty_dict_case_bad
bool load_fixed_length_dawgs
DawgInfoVector * constraints
double segment_penalty_dict_case_ok
GenericVector< int > SuccessorList
const char * kHyphenLikeUTF8[]
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
const UNICHARSET * unicharset() const
const UNICHAR_ID unichar_id(int index) const
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
#define BOOL_MEMBER(name, val, comment, vec)
#define double_MEMBER(name, val, comment, vec)
void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)
void set_permuter(uinT8 perm)
const Image * getImage() const
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)