19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
33 #define MAX_WERD_LENGTH (inT64) 128
52 static const int kAnyWordLength = -1;
53 static const int kRatingPad = 4;
56 static const char kHyphenSymbol[] =
"-";
57 static const int kMaxNumDawgEdgees = 2000000;
58 static const int kMaxDocDawgEdges = 250000;
59 static const int kMaxUserDawgEdges = 50000;
60 static const float kSimCertaintyScale = -10.0;
61 static const float kSimCertaintyOffset = -10.0;
62 static const float kSimilarityFloor = 100.0;
63 static const int kDocDictMaxRepChars = 4;
130 *word = *hyphen_word_;
145 return (last_word_on_line_ && !first_pos &&
146 unichar_id == hyphen_unichar_id_);
150 int word_index = word.
length() - 1;
168 if (word.
rating() < best_choice->
rating()) *best_choice = word;
181 return rating_limit <= 0.0;
190 int sought_word_length,
int end_char_choice_index);
206 bool word_ending,
WERD_CHOICE *word,
float certainties[],
207 float *limit,
WERD_CHOICE *best_choice,
int *attempts_left,
208 void *void_more_args);
218 const char*
choose_il1(
const char *first_char,
219 const char *second_char,
220 const char *third_char,
221 const char *prev_char,
222 const char *next_char,
223 const char *next_next_char);
262 char* pos_chartypes);
274 bool word_ending,
WERD_CHOICE *word,
float certainties[],
float *limit,
275 WERD_CHOICE *best_choice,
int *attempts_left,
void *more_args);
279 float curr_rating,
float curr_certainty,
281 const char *debug,
int word_ending,
286 int char_choice_index,
299 int char_choice_index,
310 int char_choice_index,
313 float certainties[],
float *limit,
315 void *void_more_args);
319 bool fix_replaceable,
321 bool *modified_blobs);
339 void ReplaceAmbig(
int wrong_ngram_begin_index,
int wrong_ngram_size,
342 bool *modified_blobs);
358 const float Certainties[]);
367 const char *String_lengths,
383 bool *modified_blobs);
399 const float Certainties[],
414 int label_num_unichars);
418 FLOAT32 AdjustFactor,
const float Certainties[],
479 if (pending_words_ !=
NULL)
480 pending_words_->
clear();
481 if (document_words_ !=
NULL)
482 document_words_->
clear();
574 int character_bytes);
579 int character_bytes) {
582 context, context_bytes,
588 const char*
lang,
const char* context,
int context_bytes,
589 const char*
character,
int character_bytes) {
591 (void) context_bytes;
593 (void) character_bytes;
600 int character_bytes);
605 inline const Dawg *
GetDawg(
int index)
const {
return dawgs_[index]; }
612 if (word_length > max_fixed_length_dawgs_wdlen_)
return NULL;
613 assert(dawgs_.
size() > word_length);
614 return dawgs_[word_length];
617 return max_fixed_length_dawgs_wdlen_;
621 if (edge_ref == NO_EDGE)
return 0;
623 if (node == 0) node = NO_EDGE;
632 int word_end,
DawgType current_dawg_type)
const {
633 if (!word_end)
return true;
635 for (
int c = 0; c < constraints.
length(); ++c) {
636 const DawgInfo &cinfo = constraints[c];
657 PermuterType *current_permuter)
const;
669 PermuterType perm,
int debug_level,
676 int num_dawgs,
int debug_level, FILE *output_file);
680 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
681 perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
682 perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
712 bool nonword,
float additional_adjust,
bool debug);
716 adjust_word(word, certainty_array, char_choices,
false, 0.0
f,
720 float *certainty_array,
723 adjust_word(word, certainty_array, char_choices,
true, 0.0
f, debug);
727 wordseg_rating_adjust_factor_ =
f;
748 bool keep_word_choices_;
762 bool last_word_on_line_;
771 Trie *pending_words_;
785 Trie *document_words_;
788 int max_fixed_length_dawgs_wdlen_;
791 float wordseg_rating_adjust_factor_;
793 FILE *output_ambig_words_file_;
801 "A list of user-provided patterns.");
806 "Load dawg with punctuation patterns.");
809 " dawgs (e.g. for non-space delimited languages)");
811 "Load dawg with special word bigrams.");
813 "Score multiplier for word matches which have good case and"
814 "are frequent in the given language (lower is better).");
817 "Score multiplier for word matches that have good case "
818 "(lower is better).");
821 "Default score multiplier for word matches, which may have "
822 "case issues (lower is better).");
826 "Multipler to for the best choice from the ngram model.");
829 "Score multiplier for glyph fragment segmentations which "
830 "do not match a dictionary word (lower is better).");
833 "Score multiplier for poorly cased strings that are not in"
834 " the dictionary and generally look like garbage (lower is"
837 "Output file for ambiguities found in the dictionary");
839 ", to 2 for more details, to 3 to see all the debug messages");
843 "Use only the first UTF8 step of the given string"
844 " when computing log probabilities.");
847 "Certainty threshold for non-dict words");
849 "Reject certainty offset");
851 "Size of dict word to be treated as non-dict word");
853 "Certainty to add for each dict char above small word size.");
855 "Max certaintly variation allowed in a word (in sigma)");
858 "Make AcceptableChoice() always return false. Useful"
859 " when there is a need to explore all segmentations");
861 "Gain factor for ambiguity threshold.");
863 "Certainty offset for ambiguity threshold.");
867 " should be printed to stdout");
869 "Lengths of unichars in word_to_debug");
874 " current best rate to prune other hypotheses");
876 "Turn on word script consistency permuter");
878 "incorporate segmentation cost in word rating?");
880 "Don't use any alphabetic-specific tricks."
881 "Set to true in the traineddata config file for"
882 " scripts that are cursive or inherently fixed-pitch");
884 "Score multipler for script consistency within a word. "
885 "Being a 'reward' factor, it should be <= 1. "
886 "Smaller value implies bigger reward.");
888 "Turn on fixed-length phrasebook search permuter");
890 "Turn on character type (property) consistency permuter");
892 "Score multipler for char type consistency within a word. ");
895 "Score multipler for ngram permuter's best choice"
896 " (only used in the Han script path).");
900 "Worst certainty for using pending dictionary");
902 " for words that can be inserted into the document dictionary");
904 "Activate character-level n-gram-based permuter");
906 " character choices to consider during permutation."
907 " This limit is especially useful when user patterns"
908 " are specified, since overly generic patterns can result in"
909 " dawg search exploring an overly large number of options.");
914 #endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_
bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, WERD_CHOICE *BestChoice, DANGERR *fixpt, ACCEPTABLE_CHOICE_CALLER caller, bool *modified_blobs)
Returns true if the given best_choice is good enough to stop.
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
static void ReadFixedLengthDawgs(DawgType type, const STRING &lang, PermuterType perm, int debug_level, FILE *file, DawgVector *dawg_vec, int *max_wdlen)
void go_deeper_top_fragments_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
bool valid_punctuation(const WERD_CHOICE &word)
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
void EndDangerousAmbigs()
double StopperAmbigThreshold(double f1, double f2)
void FillViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], VIABLE_CHOICE ViableChoice)
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &unicharset)
bool use_only_first_uft8_step
double doc_dict_certainty_threshold
double ngram_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
double stopper_phase2_certainty_rejection_offset
const LIST & getBestChoices()
GenericVector< Dawg * > DawgVector
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
const int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
bool ambigs_mode(float rating_limit)
Returns true if we are operating in ambigs mode.
void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices, char *pos_chartypes)
double segment_reward_chartype
#define BOOL_VAR_H(name, val, comment)
int valid_word_or_number(const WERD_CHOICE &word) const
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Semi-generic functions used by multiple permuters.
const Dawg * GetFixedLengthDawg(int word_length) const
Return the pointer to the Dawg that contains words of length word_length.
const int GetMaxFixedLengthDawgIndex() const
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
WERD_CHOICE * top_fragments_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
const char * choose_il1(const char *first_char, const char *second_char, const char *third_char, const char *prev_char, const char *next_char, const char *next_next_char)
void copy_hyphen_info(WERD_CHOICE *word) const
bool segment_segcost_rating
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
WERD_CHOICE * permute_fixed_length_words(const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
const UNICHARSET & getUnicharset() const
void AddNewChunk(VIABLE_CHOICE Choice, int Blob)
bool CurrentWordAmbig()
Returns true if there are multiple good choices for the current word.
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function that will be modified by various permuters.
int tessedit_truncate_wordchoice_log
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice)
Dumps a text representation of the specified Choice to File.
DawgInfoVector * updated_constraints
int max_permuter_attempts
void ClearBestChoiceAccum()
Clears best_choices_ list accumulated by the stopper.
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
void ResetDocumentDictionary()
const CCUtil * getCCUtil() const
int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, const WERD_CHOICE &BestChoice)
void incorporate_segcost(WERD_CHOICE *word)
Incorporate segmentation cost into word rating.
UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const
#define double_VAR_H(name, val, comment)
bool StringSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
double segment_penalty_garbage
WERD_CHOICE * permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, const WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
double segment_penalty_dict_frequent_word
bool segment_nonalphabetic_script
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
#define STRING_VAR_H(name, val, comment)
void remove_unichar_ids(int index, int num)
int FreeBadChoice(void *item1, void *item2)
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.
DawgArgs(DawgInfoVector *d, DawgInfoVector *c, DawgInfoVector *ud, DawgInfoVector *uc, float r, PermuterType p, int len, int e)
void init_active_dawgs(int sought_word_length, DawgInfoVector *active_dawgs, bool ambigs_mode) const
float rating_array[MAX_WERD_LENGTH]
double segment_penalty_dict_nonword
VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[])
char * output_ambig_words_file
FLOAT32 CurrentBestChoiceAdjustFactor()
Returns the adjustment factor for the best choice for the current word.
int valid_word(const WERD_CHOICE &word) const
WERD_CHOICE * permute_compound_words(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
bool ngram_permuter_activated
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
int max_viterbi_list_size
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs)
bool AlternativeChoicesWorseThan(FLOAT32 Threshold)
static void WriteFixedLengthDawgs(const GenericVector< SquishedDawg * > &dawg_vec, int num_dawgs, int debug_level, FILE *output_file)
double segment_penalty_ngram_best_choice
bool ConstraintsOk(const DawgInfoVector &constraints, int word_end, DawgType current_dawg_type) const
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
double stopper_allowable_character_badness
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
double stopper_ambiguity_threshold_offset
void adjust_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool nonword, float additional_adjust, bool debug)
Adjusts the rating of the given word.
bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
char * word_to_debug_lengths
const char * string() const
bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice)
Returns true if WordChoice is the same as the current best choice.
int stopper_smallword_size
WERD_CHOICE * permute_top_choice(const BLOB_CHOICE_LIST_VECTOR &char_choices, float *rating_limit, WERD_CHOICE *raw_choice, BOOL8 *any_alpha)
double doc_dict_pending_threshold
double segment_reward_ngram_best_choice
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
WERD_CHOICE * permute_chartype_words(const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
Checks for consistency in character property (e.g. alpha, digit, punct)
void FindClassifierErrors(FLOAT32 MinRating, FLOAT32 MaxRating, FLOAT32 RatingMargin, FLOAT32 Thresholds[])
int end_char_choice_index
double stopper_certainty_per_char
virtual bool end_of_word(EDGE_REF edge_ref) const =0
void reset_hyphen_vars(bool last_word_on_line)
bool AcceptableResult(const WERD_CHOICE &BestChoice)
double stopper_nondict_certainty_base
char * user_patterns_suffix
DLLSYM void tprintf(const char *format,...)
void LoadEquivalenceList(const char *unichar_strings[])
void DisableChoiceAccum()
WERD_CHOICE * permute_script_words(const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
void adjust_non_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool debug)
void LogNewSplit(int Blob)
bool permute_fixed_length_dawg
void init_constraints(DawgInfoVector *constraints) const
WERD_CHOICE * get_top_choice_word(const BLOB_CHOICE_LIST_VECTOR &char_choices)
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
DawgInfoVector * updated_active_dawgs
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, BLOB_CHOICE_LIST_VECTOR *blob_choices, bool *modified_blobs)
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
void remove_hyphen_head(WERD_CHOICE *word) const
void PrintAmbigAlternatives(FILE *file, const char *label, int label_num_unichars)
Print all the choices in raw_choices_ list for non 1-1 ambiguities.
DawgInfoVector * active_dawgs
double stopper_ambiguity_threshold_gain
bool ChoiceAccumEnabled()
void adjust_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool debug)
double segment_penalty_dict_case_bad
void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int start, int end, WERD_CHOICE *current_word)
#define INT_VAR_H(name, val, comment)
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
bool load_fixed_length_dawgs
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
DawgInfoVector * constraints
double segment_penalty_dict_case_ok
bool permute_chartype_word
bool stopper_no_acceptable_choices
void LogNewSegmentation(PIECES_STATE BlobWidth)
void DebugWordChoices()
Prints the current choices for this word to stdout.
void set_hyphen_word(const WERD_CHOICE &word, const DawgInfoVector &active_dawgs, const DawgInfoVector &constraints)
double segment_reward_script
uinT8 PIECES_STATE[MAX_NUM_CHUNKS+2]
UNICHARSET & getUnicharset()
const UnicharAmbigs & getUnicharAmbigs()
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int end_char_choice_index)
double bestrate_pruning_factor
const CHAR_FRAGMENT * fragment
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
const UNICHAR_ID unichar_id(int index) const
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
UnicharAmbigs unichar_ambigs
bool compound_marker(UNICHAR_ID unichar_id)
void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)
const Image * getImage() const