Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (Image *image_ptr)
 
 ~Dict ()
 
const Image * getImage () const
 
Image * getImage ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs ()
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
void remove_hyphen_head (WERD_CHOICE *word) const
 
bool has_hyphen_end (UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgInfoVector &active_dawgs, const DawgInfoVector &constraints)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (int sought_word_length, DawgInfoVector *active_dawgs, bool ambigs_mode) const
 
void init_constraints (DawgInfoVector *constraints) const
 
bool ambigs_mode (float rating_limit)
 Returns true if we are operating in ambigs mode. More...
 
WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
WERD_CHOICE * get_top_choice_word (const BLOB_CHOICE_LIST_VECTOR &char_choices)
 
WERD_CHOICE * permute_top_choice (const BLOB_CHOICE_LIST_VECTOR &char_choices, float *rating_limit, WERD_CHOICE *raw_choice, BOOL8 *any_alpha)
 
WERD_CHOICE * permute_all (const BLOB_CHOICE_LIST_VECTOR &char_choices, const WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
 
void end_permute ()
 
void permute_subword (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int start, int end, WERD_CHOICE *current_word)
 
bool permute_characters (const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
 
WERD_CHOICE * permute_compound_words (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
WERD_CHOICE * permute_fixed_length_words (const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
 
void incorporate_segcost (WERD_CHOICE *word)
 Incorporate segmentation cost into word rating. More...
 
WERD_CHOICE * permute_script_words (const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
 
WERD_CHOICE * permute_chartype_words (const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
 Checks for consistency in character property (e.g. alpha, digit, punct). More...
 
char top_word_chartype (const BLOB_CHOICE_LIST_VECTOR &char_choices, char *pos_chartypes)
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs)
 
double StopperAmbigThreshold (double f1, double f2)
 
int FreeBadChoice (void *item1, void *item2)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, BLOB_CHOICE_LIST_VECTOR *blob_choices, bool *modified_blobs)
 
void DisableChoiceAccum ()
 
void EnableChoiceAccum ()
 
bool ChoiceAccumEnabled ()
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice)
 Returns the length of the shortest alpha run in WordChoice. More...
 
VIABLE_CHOICE NewViableChoice (const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[])
 
void PrintViableChoice (FILE *File, const char *Label, VIABLE_CHOICE Choice)
 Dumps a text representation of the specified Choice to File. More...
 
bool StringSameAs (const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
 
bool StringSameAs (const char *String, const char *String_lengths, VIABLE_CHOICE ViableChoice)
 Compares String to ViableChoice and returns true if they are the same. More...
 
int UniformCertainties (const BLOB_CHOICE_LIST_VECTOR &Choices, const WERD_CHOICE &BestChoice)
 
bool AcceptableChoice (BLOB_CHOICE_LIST_VECTOR *Choices, WERD_CHOICE *BestChoice, DANGERR *fixpt, ACCEPTABLE_CHOICE_CALLER caller, bool *modified_blobs)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (const WERD_CHOICE &BestChoice)
 
int ChoiceSameAs (const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
 
void LogNewChoice (FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)
 
void EndDangerousAmbigs ()
 
bool CurrentBestChoiceIs (const WERD_CHOICE &WordChoice)
 Returns true if WordChoice is the same as the current best choice. More...
 
FLOAT32 CurrentBestChoiceAdjustFactor ()
 Returns the adjustment factor for the best choice for the current word. More...
 
bool CurrentWordAmbig ()
 Returns true if there are multiple good choices for the current word. More...
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void PrintAmbigAlternatives (FILE *file, const char *label, int label_num_unichars)
 Print all the choices in raw_choices_ list for non 1-1 ambiguities. More...
 
void FillViableChoice (const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], VIABLE_CHOICE ViableChoice)
 
bool AlternativeChoicesWorseThan (FLOAT32 Threshold)
 
void FilterWordChoices ()
 
void FindClassifierErrors (FLOAT32 MinRating, FLOAT32 MaxRating, FLOAT32 RatingMargin, FLOAT32 Thresholds[])
 
void InitChoiceAccum ()
 
void ClearBestChoiceAccum ()
 Clears best_choices_ list accumulated by the stopper. More...
 
void LogNewSegmentation (PIECES_STATE BlobWidth)
 
void LogNewSplit (int Blob)
 
void AddNewChunk (VIABLE_CHOICE Choice, int Blob)
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void Load ()
 
void End ()
 
void ResetDocumentDictionary ()
 
void LoadEquivalenceList (const char *unichar_strings[])
 
UNICHAR_ID NormalizeUnicharIdForMatch (UNICHAR_ID unichar_id) const
 
int def_letter_is_okay (void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
double ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 
const int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
const Dawg * GetFixedLengthDawg (int word_length) const
 Return the pointer to the Dawg that contains words of length word_length. More...
 
const int GetMaxFixedLengthDawgIndex () const
 
bool ConstraintsOk (const DawgInfoVector &constraints, int word_end, DawgType current_dawg_type) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgInfo &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
int get_top_word_script (const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &unicharset)
 
void adjust_word (WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool nonword, float additional_adjust, bool debug)
 Adjusts the rating of the given word. More...
 
void adjust_word (WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool debug)
 
void adjust_non_word (WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool debug)
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
const LIST & getBestChoices ()
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word keep exploring choices.

There are two modes for deciding whether to go deeper: regular dawg permuter mode and the special ambigs mode. If *limit is <= 0.0 the function switches to the ambigs mode (this is the case when dawg_permute_and_select() function is called from NoDangerousAmbigs()) and only searches for the first choice that has a rating better than *limit (in this case ratings are fake, since the real ratings can not be < 0). Modification of the hyphen state is turned off in the ambigs mode. When in the regular dawg permuter mode, the function explores all the possible words and chooses the one with the best rating. The letters with ratings that are far worse than the ones seen so far are pruned out.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int end_char_choice_index)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
choose_il1

Choose between the candidate il1 chars.

Parameters
first_char: first choice
second_char: second choice
third_char: third choice
prev_char: prev in word
next_char: next in word
next_next_char: after next next in word
const char * choose_il1 (const char *first_char, const char *second_char, const char *third_char, const char *prev_char, const char *next_char, const char *next_next_char)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not NULL contains information about immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be NULL if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
WERD_CHOICE * top_fragments_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_top_fragments_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 Semi-generic functions used by multiple permuters. More...
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 

Static Public Member Functions

static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static void ReadFixedLengthDawgs (DawgType type, const STRING &lang, PermuterType perm, int debug_level, FILE *file, DawgVector *dawg_vec, int *max_wdlen)
 
static void WriteFixedLengthDawgs (const GenericVector< SquishedDawg * > &dawg_vec, int num_dawgs, int debug_level, FILE *output_file)
 
static bool valid_word_permuter (uinT8 perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function that will be modified by various permuters. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 
char * user_words_suffix = ""
 
char * user_patterns_suffix = ""
 
bool load_system_dawg = true
 
bool load_freq_dawg = true
 
bool load_unambig_dawg = true
 
bool load_punc_dawg = true
 
bool load_number_dawg = true
 
bool load_fixed_length_dawgs = true
 
bool load_bigram_dawg = false
 
double segment_penalty_dict_frequent_word = 1.0
 
double segment_penalty_dict_case_ok = 1.1
 
double segment_penalty_dict_case_bad = 1.3125
 
double segment_penalty_ngram_best_choice = 1.24
 
double segment_penalty_dict_nonword = 1.25
 
double segment_penalty_garbage = 1.50
 
char * output_ambig_words_file = ""
 
int dawg_debug_level = 0
 
int hyphen_debug_level = 0
 
int max_viterbi_list_size = 10
 
bool use_only_first_uft8_step = false
 
double certainty_scale = 20.0
 
double stopper_nondict_certainty_base = -2.50
 
double stopper_phase2_certainty_rejection_offset = 1.0
 
int stopper_smallword_size = 2
 
double stopper_certainty_per_char = -0.50
 
double stopper_allowable_character_badness = 3.0
 
int stopper_debug_level = 0
 
bool stopper_no_acceptable_choices = false
 
double stopper_ambiguity_threshold_gain = 8.0
 
double stopper_ambiguity_threshold_offset = 1.5
 
bool save_raw_choices = false
 
int tessedit_truncate_wordchoice_log = 10
 
char * word_to_debug = ""
 
char * word_to_debug_lengths = ""
 
int fragments_debug = 0
 
int segment_debug = 0
 
bool permute_debug = 0
 
double bestrate_pruning_factor = 2.0
 
bool permute_script_word = 0
 
bool segment_segcost_rating = 0
 
bool segment_nonalphabetic_script = false
 
double segment_reward_script = 0.95
 
bool permute_fixed_length_dawg = 0
 
bool permute_chartype_word = 0
 
double segment_reward_chartype = 0.97
 
double segment_reward_ngram_best_choice = 0.99
 
bool save_doc_words = 0
 
bool doc_dict_enable = 1
 
double doc_dict_pending_threshold = 0.0
 
double doc_dict_certainty_threshold = -2.25
 
bool ngram_permuter_activated = false
 
int max_permuter_attempts = 10000
 
bool permute_only_top = false
 

Detailed Description

Definition at line 90 of file dict.h.

Constructor & Destructor Documentation

tesseract::Dict::Dict ( Image * image_ptr)

Definition at line 33 of file dict.cpp.

36  image_ptr_(image_ptr),
38  "A list of user-provided words.",
39  getImage()->getCCUtil()->params()),
41  "A list of user-provided patterns.",
42  getImage()->getCCUtil()->params()),
43  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
44  getImage()->getCCUtil()->params()),
45  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
46  getImage()->getCCUtil()->params()),
47  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
48  getImage()->getCCUtil()->params()),
49  BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
50  " patterns.", getImage()->getCCUtil()->params()),
51  BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
52  " patterns.", getImage()->getCCUtil()->params()),
53  BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
54  " (e.g. for non-space delimited languages)",
55  getImage()->getCCUtil()->params()),
56  BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word "
57  "bigrams.", getImage()->getCCUtil()->params()),
59  "Score multiplier for word matches which have good case and"
60  "are frequent in the given language (lower is better).",
61  getImage()->getCCUtil()->params()),
63  "Score multiplier for word matches that have good case "
64  "(lower is better).", getImage()->getCCUtil()->params()),
66  "Default score multiplier for word matches, which may have "
67  "case issues (lower is better).",
68  getImage()->getCCUtil()->params()),
70  "Multipler to for the best choice from the ngram model.",
71  getImage()->getCCUtil()->params()),
73  "Score multiplier for glyph fragment segmentations which "
74  "do not match a dictionary word (lower is better).",
75  getImage()->getCCUtil()->params()),
77  "Score multiplier for poorly cased strings that are not in"
78  " the dictionary and generally look like garbage (lower is"
79  " better).", getImage()->getCCUtil()->params()),
81  "Output file for ambiguities found in the dictionary",
82  getImage()->getCCUtil()->params()),
83  INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
84  ", to 2 for more details, to 3 to see all the debug messages",
85  getImage()->getCCUtil()->params()),
86  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
87  getImage()->getCCUtil()->params()),
88  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
89  getImage()->getCCUtil()->params()),
91  "Use only the first UTF8 step of the given string"
92  " when computing log probabilities.",
93  getImage()->getCCUtil()->params()),
94  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
95  getImage()->getCCUtil()->params()),
97  "Certainty threshold for non-dict words",
98  getImage()->getCCUtil()->params()),
100  "Reject certainty offset",
101  getImage()->getCCUtil()->params()),
103  "Size of dict word to be treated as non-dict word",
104  getImage()->getCCUtil()->params()),
105  double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add"
106  " for each dict char above small word size.",
107  getImage()->getCCUtil()->params()),
109  "Max certaintly variation allowed in a word (in sigma)",
110  getImage()->getCCUtil()->params()),
111  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
112  getImage()->getCCUtil()->params()),
114  "Make AcceptableChoice() always return false. Useful"
115  " when there is a need to explore all segmentations",
116  getImage()->getCCUtil()->params()),
118  "Gain factor for ambiguity threshold.",
119  getImage()->getCCUtil()->params()),
121  "Certainty offset for ambiguity threshold.",
122  getImage()->getCCUtil()->params()),
123  BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices",
124  getImage()->getCCUtil()->params()),
126  "Max words to keep in list",
127  getImage()->getCCUtil()->params()),
128  STRING_MEMBER(word_to_debug, "", "Word for which stopper debug"
129  " information should be printed to stdout",
130  getImage()->getCCUtil()->params()),
132  "Lengths of unichars in word_to_debug",
133  getImage()->getCCUtil()->params()),
134  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
135  getImage()->getCCUtil()->params()),
136  INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
137  getImage()->getCCUtil()->params()),
138  BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
139  getImage()->getCCUtil()->params()),
140  double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of"
141  " current best rate to prune other hypotheses",
142  getImage()->getCCUtil()->params()),
144  "Turn on word script consistency permuter",
145  getImage()->getCCUtil()->params()),
147  "incorporate segmentation cost in word rating?",
148  getImage()->getCCUtil()->params()),
150  "Don't use any alphabetic-specific tricks."
151  "Set to true in the traineddata config file for"
152  " scripts that are cursive or inherently fixed-pitch",
153  getImage()->getCCUtil()->params()),
155  "Score multipler for script consistency within a word. "
156  "Being a 'reward' factor, it should be <= 1. "
157  "Smaller value implies bigger reward.",
158  getImage()->getCCUtil()->params()),
160  "Turn on fixed-length phrasebook search permuter",
161  getImage()->getCCUtil()->params()),
163  "Turn on character type (property) consistency permuter",
164  getImage()->getCCUtil()->params()),
166  "Score multipler for char type consistency within a word. ",
167  getImage()->getCCUtil()->params()),
169  "Score multipler for ngram permuter's best choice"
170  " (only used in the Han script path).",
171  getImage()->getCCUtil()->params()),
172  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
173  getImage()->getCCUtil()->params()),
174  BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ",
175  getImage()->getCCUtil()->params()),
177  "Worst certainty for using pending dictionary",
178  getImage()->getCCUtil()->params()),
180  "Worst certainty for words that can be inserted into the"
181  "document dictionary", getImage()->getCCUtil()->params()),
183  "Activate character-level n-gram-based permuter",
184  getImage()->getCCUtil()->params()),
185  INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different"
186  " character choices to consider during permutation."
187  " This limit is especially useful when user patterns"
188  " are specified, since overly generic patterns can result in"
189  " dawg search exploring an overly large number of options.",
190  getImage()->getCCUtil()->params()),
191  BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
192  getImage()->getCCUtil()->params()) {
193  dang_ambigs_table_ = NULL;
194  replace_ambigs_table_ = NULL;
195  keep_word_choices_ = false;
196  reject_offset_ = 0.0;
197  best_raw_choice_ = NULL;
198  best_choices_ = NIL_LIST;
199  raw_choices_ = NIL_LIST;
201  hyphen_word_ = NULL;
202  last_word_on_line_ = false;
203  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
204  document_words_ = NULL;
205  pending_words_ = NULL;
206  bigram_dawg_ = NULL;
207  freq_dawg_ = NULL;
208  punc_dawg_ = NULL;
209  max_fixed_length_dawgs_wdlen_ = -1;
210  wordseg_rating_adjust_factor_ = -1.0f;
211  output_ambig_words_file_ = NULL;
212 }
bool use_only_first_uft8_step
Definition: dict.h:844
char * user_words_suffix
Definition: dict.h:799
double doc_dict_certainty_threshold
Definition: dict.h:902
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:849
bool load_bigram_dawg
Definition: dict.h:811
bool doc_dict_enable
Definition: dict.h:898
double segment_reward_chartype
Definition: dict.h:892
bool load_number_dawg
Definition: dict.h:807
bool load_unambig_dawg
Definition: dict.h:804
#define NIL_LIST
Definition: oldlist.h:126
bool segment_segcost_rating
Definition: dict.h:878
#define NULL
Definition: host.h:144
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function that will be modified by various permuters.
Definition: dict.h:308
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:290
bool permute_debug
Definition: dict.h:872
double certainty_scale
Definition: dict.h:845
int tessedit_truncate_wordchoice_log
Definition: dict.h:865
int max_permuter_attempts
Definition: dict.h:909
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:272
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:287
double segment_penalty_garbage
Definition: dict.h:835
double segment_penalty_dict_frequent_word
Definition: dict.h:814
bool segment_nonalphabetic_script
Definition: dict.h:882
int segment_debug
Definition: dict.h:871
int hyphen_debug_level
Definition: dict.h:840
double segment_penalty_dict_nonword
Definition: dict.h:830
char * output_ambig_words_file
Definition: dict.h:837
bool ngram_permuter_activated
Definition: dict.h:904
int max_viterbi_list_size
Definition: dict.h:841
double segment_penalty_ngram_best_choice
Definition: dict.h:826
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:587
double stopper_allowable_character_badness
Definition: dict.h:855
double stopper_ambiguity_threshold_offset
Definition: dict.h:863
int dawg_debug_level
Definition: dict.h:839
bool load_system_dawg
Definition: dict.h:802
char * word_to_debug_lengths
Definition: dict.h:869
int stopper_smallword_size
Definition: dict.h:851
double doc_dict_pending_threshold
Definition: dict.h:900
double segment_reward_ngram_best_choice
Definition: dict.h:896
bool save_doc_words
Definition: dict.h:897
double stopper_certainty_per_char
Definition: dict.h:853
int stopper_debug_level
Definition: dict.h:856
double stopper_nondict_certainty_base
Definition: dict.h:847
char * user_patterns_suffix
Definition: dict.h:801
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:380
bool permute_fixed_length_dawg
Definition: dict.h:888
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:560
bool save_raw_choices
Definition: dict.h:864
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:278
double stopper_ambiguity_threshold_gain
Definition: dict.h:861
char * word_to_debug
Definition: dict.h:867
int fragments_debug
Definition: dict.h:870
bool load_punc_dawg
Definition: dict.h:806
double segment_penalty_dict_case_bad
Definition: dict.h:822
bool permute_script_word
Definition: dict.h:876
bool load_fixed_length_dawgs
Definition: dict.h:809
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:570
double segment_penalty_dict_case_ok
Definition: dict.h:818
bool permute_chartype_word
Definition: dict.h:890
bool stopper_no_acceptable_choices
Definition: dict.h:859
bool load_freq_dawg
Definition: dict.h:803
double segment_reward_script
Definition: dict.h:886
bool permute_only_top
Definition: dict.h:910
double bestrate_pruning_factor
Definition: dict.h:874
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:275
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:281
const Image * getImage() const
Definition: dict.h:94
tesseract::Dict::~Dict ( )

Definition at line 214 of file dict.cpp.

214  {
215  if (hyphen_word_ != NULL) delete hyphen_word_;
216  if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
217 }
#define NULL
Definition: host.h:144

Member Function Documentation

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. image mistakenly recognized as text).

Definition at line 78 of file context.cpp.

79  {
80  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
81  int num_alphanum = 0;
82  for (int x = 0; x < word.length(); ++x) {
83  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
84  unicharset.get_isdigit(word.unichar_id(x)));
85  }
86  return (static_cast<float>(num_alphanum) /
87  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
88 }
int length() const
Definition: ratngs.h:214
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
bool tesseract::Dict::AcceptableChoice ( BLOB_CHOICE_LIST_VECTOR * Choices,
WERD_CHOICE * BestChoice,
DANGERR * fixpt,
ACCEPTABLE_CHOICE_CALLER  caller,
bool *  modified_blobs 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 191 of file stopper.cpp.

195  {
196  float CertaintyThreshold = stopper_nondict_certainty_base;
197  int WordSize;
198  if (modified_blobs != NULL) *modified_blobs = false;
199 
200  if (stopper_no_acceptable_choices) return false;
201 
202  if (fixpt != NULL) fixpt->clear();
203  if (BestChoice->length() == 0)
204  return false;
205  if (caller == CHOPPER_CALLER && BestChoice->fragment_mark()) {
206  if (stopper_debug_level >= 1) {
207  cprintf("AcceptableChoice(): a choice with fragments beats BestChoice");
208  }
209  return false;
210  }
211 
212  bool no_dang_ambigs = (GetMaxFixedLengthDawgIndex() >= 0 ||
213  NoDangerousAmbig(BestChoice, fixpt, true,
214  Choices, modified_blobs));
215  bool is_valid_word = valid_word_permuter(BestChoice->permuter(), false);
216  bool is_case_ok = case_ok(*BestChoice, getUnicharset());
217 
218  if (stopper_debug_level >= 1)
219  tprintf("\nStopper: %s (word=%c, case=%c)\n",
220  BestChoice->debug_string().string(),
221  (is_valid_word ? 'y' : 'n'),
222  (is_case_ok ? 'y' : 'n'));
223 
224  // Do not accept invalid words in PASS1.
225  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
226  if (is_valid_word && is_case_ok) {
227  WordSize = LengthOfShortestAlphaRun(*BestChoice);
228  WordSize -= stopper_smallword_size;
229  if (WordSize < 0)
230  WordSize = 0;
231  CertaintyThreshold += WordSize * stopper_certainty_per_char;
232  }
233 
234  if (stopper_debug_level >= 1)
235  tprintf("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
236  BestChoice->certainty(), CertaintyThreshold);
237 
238  if (no_dang_ambigs &&
239  BestChoice->certainty() > CertaintyThreshold &&
240  UniformCertainties(*Choices, *BestChoice)) {
241  return true;
242  } else {
243  if (stopper_debug_level >= 2) {
244  tprintf("AcceptableChoice() returned false"
245  " (no_dang_ambig:%d cert:%g thresh:%g uniform:%d)\n",
246  no_dang_ambigs, BestChoice->certainty(),
247  CertaintyThreshold,
248  UniformCertainties(*Choices, *BestChoice));
249  }
250  return false;
251  }
252 }
int length() const
Definition: ratngs.h:214
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
const STRING debug_string() const
Definition: ratngs.h:373
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:883
virtual void clear()
float certainty() const
Definition: ratngs.h:234
const int GetMaxFixedLengthDawgIndex() const
Definition: dict.h:616
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
#define f(xc, yc)
Definition: imgscale.cpp:39
int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, const WERD_CHOICE &BestChoice)
Definition: stopper.cpp:986
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs)
Definition: stopper.cpp:581
uinT8 permuter() const
Definition: ratngs.h:237
const char * string() const
Definition: strngs.cpp:156
int stopper_smallword_size
Definition: dict.h:851
double stopper_certainty_per_char
Definition: dict.h:853
int stopper_debug_level
Definition: dict.h:856
double stopper_nondict_certainty_base
Definition: dict.h:847
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool fragment_mark() const
Definition: ratngs.h:241
bool stopper_no_acceptable_choices
Definition: dict.h:859
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:679
bool tesseract::Dict::AcceptableResult ( const WERD_CHOICE & BestChoice)

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 254 of file stopper.cpp.

254  {
255  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
256  int WordSize;
257 
258  if (stopper_debug_level >= 1) {
259  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
260  BestChoice.debug_string().string(),
261  (valid_word(BestChoice) ? 'y' : 'n'),
262  (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
263  ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
264  }
265 
266  if (BestChoice.length() == 0 || CurrentWordAmbig())
267  return false;
268  if (BestChoice.fragment_mark()) {
269  if (stopper_debug_level >= 1) {
270  cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
271  }
272  return false;
273  }
274  if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) {
275  WordSize = LengthOfShortestAlphaRun(BestChoice);
276  WordSize -= stopper_smallword_size;
277  if (WordSize < 0)
278  WordSize = 0;
279  CertaintyThreshold += WordSize * stopper_certainty_per_char;
280  }
281 
282  if (stopper_debug_level >= 1)
283  cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
284  BestChoice.certainty(), CertaintyThreshold);
285 
286  if (BestChoice.certainty() > CertaintyThreshold &&
287  !stopper_no_acceptable_choices) {
288  if (stopper_debug_level >= 1)
289  cprintf("ACCEPTED\n");
290  return true;
291  }
292  else {
293  if (stopper_debug_level >= 1)
294  cprintf("REJECTED\n");
295  return false;
296  }
297 }
int length() const
Definition: ratngs.h:214
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
const STRING debug_string() const
Definition: ratngs.h:373
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:883
float certainty() const
Definition: ratngs.h:234
#define NIL_LIST
Definition: oldlist.h:126
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool CurrentWordAmbig()
Returns true if there are multiple good choices for the current word.
Definition: stopper.cpp:325
#define list_rest(l)
Definition: oldlist.h:138
const char * string() const
Definition: strngs.cpp:156
int stopper_smallword_size
Definition: dict.h:851
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:807
double stopper_certainty_per_char
Definition: dict.h:853
int stopper_debug_level
Definition: dict.h:856
double stopper_nondict_certainty_base
Definition: dict.h:847
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool fragment_mark() const
Definition: ratngs.h:241
bool stopper_no_acceptable_choices
Definition: dict.h:859
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 690 of file dict.cpp.

690  {
691  // Do not add hyphenated word parts to the document dawg.
692  // hyphen_word_ will be non-NULL after the set_hyphen_word() is
693  // called when the first part of the hyphenated word is
694  // discovered and while the second part of the word is recognized.
695  // hyphen_word_ is cleared in cc_recg() before the next word on
696  // the line is recognized.
697  if (hyphen_word_) return;
698 
699  char filename[CHARS_PER_LINE];
700  FILE *doc_word_file;
701  int stringlen = best_choice.length();
702 
703  if (!doc_dict_enable || valid_word(best_choice) ||
704  CurrentWordAmbig() || stringlen < 2)
705  return;
706 
707  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
708  if (best_choice.length() >= kDocDictMaxRepChars) {
709  int num_rep_chars = 1;
710  UNICHAR_ID uch_id = best_choice.unichar_id(0);
711  for (int i = 1; i < best_choice.length(); ++i) {
712  if (best_choice.unichar_id(i) != uch_id) {
713  num_rep_chars = 1;
714  uch_id = best_choice.unichar_id(i);
715  } else {
716  ++num_rep_chars;
717  if (num_rep_chars == kDocDictMaxRepChars) return;
718  }
719  }
720  }
721 
722  if (best_choice.certainty() < doc_dict_certainty_threshold ||
723  stringlen == 2) {
724  if (best_choice.certainty() < doc_dict_pending_threshold)
725  return;
726 
727  if (!pending_words_->word_in_dawg(best_choice)) {
728  if (stringlen > 2 ||
729  (stringlen == 2 &&
730  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
731  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
732  pending_words_->add_word_to_dawg(best_choice);
733  }
734  return;
735  }
736  }
737 
738  if (save_doc_words) {
739  strcpy(filename, getImage()->getCCUtil()->imagefile.string());
740  strcat(filename, ".doc");
741  doc_word_file = open_file (filename, "a");
742  fprintf(doc_word_file, "%s\n",
743  best_choice.debug_string().string());
744  fclose(doc_word_file);
745  }
746  document_words_->add_word_to_dawg(best_choice);
747 }
int length() const
Definition: ratngs.h:214
int UNICHAR_ID
Definition: unichar.h:31
double doc_dict_certainty_threshold
Definition: dict.h:902
const STRING debug_string() const
Definition: ratngs.h:373
bool doc_dict_enable
Definition: dict.h:898
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
float certainty() const
Definition: ratngs.h:234
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:173
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool CurrentWordAmbig()
Returns true if there are multiple good choices for the current word.
Definition: stopper.cpp:325
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:48
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
#define CHARS_PER_LINE
Definition: cutil.h:57
char imagefile[]
const char * string() const
Definition: strngs.cpp:156
double doc_dict_pending_threshold
Definition: dict.h:900
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:807
bool save_doc_words
Definition: dict.h:897
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
const Image * getImage() const
Definition: dict.h:94
void tesseract::Dict::AddNewChunk ( VIABLE_CHOICE  Choice,
int  Blob 
)

Increments the chunk count of the character in Choice which corresponds to Blob (index of the blob being split).

Definition at line 788 of file stopper.cpp.

788  {
789  int i, LastChunk;
790  for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
791  LastChunk += Choice->Blob[i].NumChunks;
792  if (Blob < LastChunk) {
793  (Choice->Blob[i].NumChunks)++;
794  return;
795  }
796  }
797  cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
798  Choice->Length, LastChunk, Blob);
799  assert(false); // this should never get executed
800 }
CHAR_CHOICE * Blob
Definition: stopper.h:74
uinT16 NumChunks
Definition: stopper.h:52
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
void tesseract::Dict::adjust_non_word ( WERD_CHOICE * word,
float *  certainty_array,
const BLOB_CHOICE_LIST_VECTOR * char_choices,
bool  debug 
)
inline

Definition at line 719 of file dict.h.

722  {
723  adjust_word(word, certainty_array, char_choices, true, 0.0f, debug);
724  }
#define f(xc, yc)
Definition: imgscale.cpp:39
void adjust_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool nonword, float additional_adjust, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:749
void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
float *  certainty_array,
const BLOB_CHOICE_LIST_VECTOR * char_choices,
bool  nonword,
float  additional_adjust,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 749 of file dict.cpp.

754  {
755  bool is_han = (char_choices != NULL &&
756  getUnicharset().han_sid() != getUnicharset().null_sid() &&
757  get_top_word_script(*char_choices, getUnicharset()) ==
758  getUnicharset().han_sid());
759  bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
760  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
761 
762  float adjust_factor = additional_adjust;
763  float new_rating = word->rating();
764  if (debug) {
765  tprintf("%sWord: %s %4.2f ", nonword ? "Non-" : "",
766  word->debug_string().string(), word->rating());
767  }
768  new_rating += kRatingPad;
769  if (nonword) { // non-dictionary word
770  if (case_is_ok && punc_is_ok) {
771  adjust_factor += segment_penalty_dict_nonword;
772  new_rating *= adjust_factor;
773  if (debug) tprintf(", W");
774  } else {
775  adjust_factor += segment_penalty_garbage;
776  new_rating *= adjust_factor;
777  if (debug) {
778  if (!case_is_ok) tprintf(", C");
779  if (!punc_is_ok) tprintf(", P");
780  }
781  }
782  } else { // dictionary word
783  if (case_is_ok) {
784  if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
785  word->set_permuter(FREQ_DAWG_PERM);
786  adjust_factor += segment_penalty_dict_frequent_word;
787  new_rating *= adjust_factor;
788  if (debug) tprintf(", F");
789  } else {
790  adjust_factor += segment_penalty_dict_case_ok;
791  new_rating *= adjust_factor;
792  if (debug) tprintf(", ");
793  }
794  } else {
795  adjust_factor += segment_penalty_dict_case_bad;
796  new_rating *= adjust_factor;
797  if (debug) tprintf(", C");
798  }
799  }
800  new_rating -= kRatingPad;
801  word->set_rating(new_rating);
802  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
803  LogNewChoice(adjust_factor, certainty_array, false, word,
804  *char_choices);
805 }
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:879
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
void set_rating(float new_val)
Definition: ratngs.h:255
int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &unicharset)
Definition: dict.cpp:908
const STRING debug_string() const
Definition: ratngs.h:373
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:48
double segment_penalty_garbage
Definition: dict.h:835
double segment_penalty_dict_frequent_word
Definition: dict.h:814
double segment_penalty_dict_nonword
Definition: dict.h:830
const char * string() const
Definition: strngs.cpp:156
int null_sid() const
Definition: unicharset.h:752
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
double segment_penalty_dict_case_bad
Definition: dict.h:822
double segment_penalty_dict_case_ok
Definition: dict.h:818
int han_sid() const
Definition: unicharset.h:757
float rating() const
Definition: ratngs.h:231
void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)
Definition: stopper.cpp:484
void set_permuter(uinT8 perm)
Definition: ratngs.h:261
void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
float *  certainty_array,
const BLOB_CHOICE_LIST_VECTOR * char_choices,
bool  debug 
)
inline

Definition at line 713 of file dict.h.

715  {
716  adjust_word(word, certainty_array, char_choices, false, 0.0f,
717  debug);
718  }
#define f(xc, yc)
Definition: imgscale.cpp:39
void adjust_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool nonword, float additional_adjust, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:749
bool tesseract::Dict::AlternativeChoicesWorseThan ( FLOAT32  Threshold)

Returns true if there are no alternative choices for the current word or if all alternatives have an adjust factor worse than Threshold.

Definition at line 299 of file stopper.cpp.

299  {
300  LIST Alternatives;
301  VIABLE_CHOICE Choice;
302  Alternatives = list_rest (best_choices_);
303  iterate(Alternatives) {
304  Choice = (VIABLE_CHOICE) first_node (Alternatives);
305  if (Choice->AdjustFactor <= Threshold)
306  return false;
307  }
308  return true;
309 }
#define list_rest(l)
Definition: oldlist.h:138
VIABLE_CHOICE_STRUCT * VIABLE_CHOICE
Definition: stopper.h:86
FLOAT32 AdjustFactor
Definition: stopper.h:72
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
bool tesseract::Dict::ambigs_mode ( float  rating_limit)
inline

Returns true if we are operating in ambigs mode.

Definition at line 180 of file dict.h.

180  {
181  return rating_limit <= 0.0;
182  }
void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR & char_choices,
const BLOB_CHOICE & blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO * prev_char_frag_info,
WERD_CHOICE * word,
float  certainties[],
float *  limit,
WERD_CHOICE * best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 1479 of file permute.cpp.

1490  {
1491  int word_ending =
1492  (char_choice_index == char_choices.length() - 1) ? true : false;
1493 
1494  // Deal with fragments.
1495  CHAR_FRAGMENT_INFO char_frag_info;
1496  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
1497  blob_choice.certainty(), prev_char_frag_info, debug,
1498  word_ending, &char_frag_info)) {
1499  return; // blob_choice must be an invalid fragment
1500  }
1501  // Search the next letter if this character is a fragment.
1502  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
1503  permute_choices(debug, char_choices, char_choice_index + 1,
1504  &char_frag_info, word, certainties, limit,
1505  best_choice, attempts_left, more_args);
1506  return;
1507  }
1508 
1509  // Add the next unichar.
1510  float old_rating = word->rating();
1511  float old_certainty = word->certainty();
1512  uinT8 old_permuter = word->permuter();
1513  certainties[word->length()] = char_frag_info.certainty;
1514  word->append_unichar_id_space_allocated(
1515  char_frag_info.unichar_id, char_frag_info.num_fragments,
1516  char_frag_info.rating, char_frag_info.certainty);
1517 
1518  // Explore the next unichar.
1519  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
1520  &char_frag_info, word_ending, word, certainties,
1521  limit, best_choice, attempts_left, more_args);
1522 
1523  // Remove the unichar we added to explore other choices in its place.
1524  word->remove_last_unichar_id();
1525  word->set_rating(old_rating);
1526  word->set_certainty(old_certainty);
1527  word->set_permuter(old_permuter);
1528 }
int length() const
Definition: ratngs.h:214
void set_rating(float new_val)
Definition: ratngs.h:255
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Semi-generic functions used by multiple permuters.
Definition: permute.cpp:1283
int num_fragments
Definition: dict.h:40
float certainty() const
Definition: ratngs.h:234
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function that will be modified by various permuters.
Definition: dict.h:308
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
float rating
Definition: dict.h:41
UNICHAR_ID unichar_id
Definition: dict.h:38
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permute.cpp:1437
float certainty() const
Definition: ratngs.h:65
uinT8 permuter() const
Definition: ratngs.h:237
void remove_last_unichar_id()
Definition: ratngs.h:356
void set_certainty(float new_val)
Definition: ratngs.h:258
int length() const
Definition: genericvector.h:63
unsigned char uinT8
Definition: host.h:99
float certainty
Definition: dict.h:42
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.h:331
float rating() const
Definition: ratngs.h:231
void set_permuter(uinT8 perm)
Definition: ratngs.h:261
float rating() const
Definition: ratngs.h:62
int tesseract::Dict::case_ok ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Check a string to see if it matches a set of lexical rules.

Definition at line 58 of file context.cpp.

58  {
59  int last_state = 0;
60  int state = 0;
61  int x;
62  for (x = 0; x < word.length(); ++x) {
63  UNICHAR_ID ch_id = word.unichar_id(x);
64  if (unicharset.get_isupper(ch_id))
65  state = case_state_table[state][1];
66  else if (unicharset.get_islower(ch_id))
67  state = case_state_table[state][2];
68  else if (unicharset.get_isdigit(ch_id))
69  state = case_state_table[state][3];
70  else
71  state = case_state_table[state][0];
72  if (state == -1) return false;
73  last_state = state;
74  }
75  return state != 5; // single lower is bad
76 }
int length() const
Definition: ratngs.h:214
int UNICHAR_ID
Definition: unichar.h:31
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
const int case_state_table[6][4]
Definition: context.cpp:35
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
bool tesseract::Dict::ChoiceAccumEnabled ( )
inline

Definition at line 346 of file dict.h.

346 { return keep_word_choices_; }
int tesseract::Dict::ChoiceSameAs ( const WERD_CHOICE & WordChoice,
VIABLE_CHOICE  ViableChoice 
)

Compares the corresponding strings of WordChoice and ViableChoice and returns true if they are the same.

Definition at line 878 of file stopper.cpp.

879  {
880  return (StringSameAs(WordChoice, ViableChoice));
881 }
bool StringSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
Definition: stopper.cpp:951
const char * tesseract::Dict::choose_il1 ( const char *  first_char,
const char *  second_char,
const char *  third_char,
const char *  prev_char,
const char *  next_char,
const char *  next_next_char 
)

Definition at line 1163 of file permute.cpp.

1168  {
1169  inT32 type1; //1/I/l type of first choice
1170  inT32 type2; //1/I/l type of second choice
1171  inT32 type3; //1/I/l type of third choice
1172 
1173  int first_char_length = strlen(first_char);
1174  int prev_char_length = strlen(prev_char);
1175  int next_char_length = strlen(next_char);
1176  int next_next_char_length = strlen(next_next_char);
1177 
1178  if (*first_char == 'l' && *second_char != '\0') {
1179  if (*second_char == 'I'
1180  && (((prev_char_length != 0 &&
1181  getUnicharset().get_isupper (prev_char, prev_char_length)) &&
1182  (next_char_length == 0 ||
1183  !getUnicharset().get_islower (next_char, next_char_length)) &&
1184  (next_char_length == 0 ||
1185  !getUnicharset().get_isdigit (next_char, next_char_length))) ||
1186  ((next_char_length != 0 &&
1187  getUnicharset().get_isupper (next_char, next_char_length)) &&
1188  (prev_char_length == 0 ||
1189  !getUnicharset().get_islower (prev_char, prev_char_length)) &&
1190  (prev_char_length == 0 ||
1191  !getUnicharset().get_isdigit (prev_char, prev_char_length)))))
1192  first_char = second_char; //override
1193  else if (*second_char == '1' || *third_char == '1') {
1194  if ((next_char_length != 0 &&
1195  getUnicharset().get_isdigit (next_char, next_char_length)) ||
1196  (prev_char_length != 0 &&
1197  getUnicharset().get_isdigit (prev_char, prev_char_length))
1198  || (*next_char == 'l' &&
1199  (next_next_char_length != 0 &&
1200  getUnicharset().get_isdigit (next_next_char,
1201  next_next_char_length)))) {
1202  first_char = "1";
1203  first_char_length = 1;
1204  }
1205  else if ((prev_char_length == 0 ||
1206  !getUnicharset().get_islower (prev_char, prev_char_length)) &&
1207  ((next_char_length == 0 ||
1208  !getUnicharset().get_islower (next_char, next_char_length)) ||
1209  (*next_char == 's' &&
1210  *next_next_char == 't'))) {
1211  if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
1212  && ((*next_char != '\'' && *next_char != '`')
1213  || *prev_char != '\0')) {
1214  first_char = "1";
1215  first_char_length = 1;
1216  }
1217  }
1218  }
1219  if (*first_char == 'l' && *next_char != '\0' &&
1220  (prev_char_length == 0 ||
1221  !getUnicharset().get_isalpha (prev_char, prev_char_length))) {
1222  type1 = 2;
1223 
1224  if (*second_char == '1')
1225  type2 = 0;
1226  else if (*second_char == 'I')
1227  type2 = 1;
1228  else if (*second_char == 'l')
1229  type2 = 2;
1230  else
1231  type2 = type1;
1232 
1233  if (*third_char == '1')
1234  type3 = 0;
1235  else if (*third_char == 'I')
1236  type3 = 1;
1237  else if (*third_char == 'l')
1238  type3 = 2;
1239  else
1240  type3 = type1;
1241 
1242 #if 0
1243  if (bigram_counts[*next_char][type2] >
1244  bigram_counts[*next_char][type1]) {
1245  first_char = second_char;
1246  type1 = type2;
1247  }
1248  if (bigram_counts[*next_char][type3] >
1249  bigram_counts[*next_char][type1]) {
1250  first_char = third_char;
1251  }
1252 #endif
1253  }
1254  }
1255  return first_char;
1256 }
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
int inT32
Definition: host.h:102
void tesseract::Dict::ClearBestChoiceAccum ( )

Clears best_choices_ list accumulated by the stopper.

Definition at line 458 of file stopper.cpp.

458  {
459  if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct);
460  best_choices_ = NIL_LIST;
461 }
void DeleteViableChoiceStruct(void *vcs)
Definition: stopper.cpp:59
#define NIL_LIST
Definition: oldlist.h:126
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 110 of file dict.h.

110  {
111  return (unichar_id == getUnicharset().unichar_to_id("-") ||
112  unichar_id == getUnicharset().unichar_to_id("/"));
113  }
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool tesseract::Dict::ConstraintsOk ( const DawgInfoVector & constraints,
int  word_end,
DawgType  current_dawg_type 
) const
inline

At word ending make sure all the recorded constraints are satisfied. Each constraint signifies that we found a beginning pattern in a pattern dawg. Check that this pattern can end here (e.g. if some leading punctuation is found this would ensure that we are not expecting any particular trailing punctuation after the word).

Definition at line 631 of file dict.h.

632  {
633  if (!word_end) return true;
634  if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
635  for (int c = 0; c < constraints.length(); ++c) {
636  const DawgInfo &cinfo = constraints[c];
637  Dawg *cdawg = dawgs_[cinfo.dawg_index];
638  if (!cdawg->end_of_word(cinfo.ref)) {
639  if (dawg_debug_level >= 3) {
640  tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
641  cinfo.dawg_index, cinfo.ref);
642  }
643  return false;
644  }
645  }
646  return true;
647  }
#define REFFORMAT
Definition: dawg.h:92
int dawg_debug_level
Definition: dict.h:839
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE * word) const
inline

If this word is hyphenated copy the base word (the part on the line before) of a hyphenated word into the given word. This function assumes that word is not NULL.

Definition at line 128 of file dict.h.

128  {
129  if (this->hyphenated()) {
130  *word = *hyphen_word_;
131  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
132  }
133  }
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:118
int hyphen_debug_level
Definition: dict.h:840
const void print() const
Definition: ratngs.h:406
FLOAT32 tesseract::Dict::CurrentBestChoiceAdjustFactor ( )

Returns the adjustment factor for the best choice for the current word.

Definition at line 316 of file stopper.cpp.

316  {
317  VIABLE_CHOICE BestChoice;
318  if (best_choices_ == NIL_LIST)
319  return (MAX_FLOAT32);
320  BestChoice = (VIABLE_CHOICE) first_node (best_choices_);
321  return (BestChoice->AdjustFactor);
322 }
#define NIL_LIST
Definition: oldlist.h:126
VIABLE_CHOICE_STRUCT * VIABLE_CHOICE
Definition: stopper.h:86
#define MAX_FLOAT32
Definition: host.h:124
FLOAT32 AdjustFactor
Definition: stopper.h:72
#define first_node(l)
Definition: oldlist.h:139
bool tesseract::Dict::CurrentBestChoiceIs ( const WERD_CHOICE & WordChoice)

Returns true if WordChoice is the same as the current best choice.

Definition at line 311 of file stopper.cpp.

311  {
312  return (best_choices_ != NIL_LIST &&
313  StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(best_choices_)));
314 }
#define NIL_LIST
Definition: oldlist.h:126
bool StringSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
Definition: stopper.cpp:951
#define first_node(l)
Definition: oldlist.h:139
bool tesseract::Dict::CurrentWordAmbig ( )

Returns true if there are multiple good choices for the current word.

Definition at line 325 of file stopper.cpp.

325  {
326  return (list_rest (best_choices_) != NIL_LIST);
327 }
#define NIL_LIST
Definition: oldlist.h:126
#define list_rest(l)
Definition: oldlist.h:138
WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR & char_choices,
float  rating_limit,
int  sought_word_length,
int  start_char_choice_index 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

If sought_word_length is not kAnyWordLength, the function only searches for a valid word formed by the given char_choices in one fixed length dawg (that contains words of length sought_word_length) starting at the start_char_choice_index.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 263 of file permdawg.cpp.

265  {
266  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
267  best_choice->make_bad();
268  best_choice->set_rating(rating_limit);
269  if (char_choices.length() == 0) return best_choice;
270  DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
271  DawgInfoVector *constraints = new DawgInfoVector[char_choices.length() + 1];
272  init_active_dawgs(sought_word_length, &(active_dawgs[0]),
273  ambigs_mode(rating_limit));
274  init_constraints(&(constraints[0]));
275  int end_char_choice_index = (sought_word_length == kAnyWordLength) ?
276  char_choices.length()-1 : start_char_choice_index+sought_word_length-1;
277  // Need to skip accumulating word choices if we are only searching a part of
278  // the word (e.g. for the phrase search in non-space delimited languages).
279  // Also need to skip accumulating choices if char_choices are expanded
280  // with ambiguities.
281  bool re_enable_choice_accum = ChoiceAccumEnabled();
282  if (sought_word_length != kAnyWordLength ||
283  ambigs_mode(rating_limit)) DisableChoiceAccum();
284  DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
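285  &(active_dawgs[1]), &(constraints[1]),
286  (segment_penalty_dict_case_bad /
287  segment_penalty_dict_case_ok),
288  NO_PERM, sought_word_length, end_char_choice_index);
289  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
290  copy_hyphen_info(&word);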
291  // Discard rating and certainty of the hyphen base (if any).
292  word.set_rating(0.0);
293  word.set_certainty(0.0);
294  if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
295  delete[] active_dawgs;
296  delete[] constraints;
297  return best_choice; // the word is too long to permute
298  }
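299  float certainties[MAX_WERD_LENGTH];
300  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
301  int attempts_left = max_permuter_attempts;
302  permute_choices((permute_debug && dawg_debug_level) ?
303  "permute_dawg_debug" : NULL,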
304  char_choices, start_char_choice_index, NULL, &word,
305  certainties, &rating_limit, best_choice, &attempts_left,
306  &dawg_args);
307  delete[] active_dawgs;
308  delete[] constraints;
309  if (re_enable_choice_accum) EnableChoiceAccum();
310  return best_choice;
311 }
void set_rating(float new_val)
Definition: ratngs.h:255
bool ambigs_mode(float rating_limit)
Returns true if we are operating in ambigs mode.
Definition: dict.h:180
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:321
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:128
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function that will be modified by various permuters.
Definition: dict.h:308
bool permute_debug
Definition: dict.h:872
void EnableChoiceAccum()
Definition: dict.h:345
int max_permuter_attempts
Definition: dict.h:909
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permute.cpp:1437
void init_active_dawgs(int sought_word_length, DawgInfoVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:643
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:67
int dawg_debug_level
Definition: dict.h:839
void DisableChoiceAccum()
Definition: dict.h:344
void init_constraints(DawgInfoVector *constraints) const
Definition: dict.cpp:677
int length() const
Definition: genericvector.h:63
bool ChoiceAccumEnabled()
Definition: dict.h:346
double segment_penalty_dict_case_bad
Definition: dict.h:822
#define MAX_WERD_LENGTH
Definition: dict.h:33
double segment_penalty_dict_case_ok
Definition: dict.h:818
WERD_CHOICE* tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)
inline

Definition at line 191 of file dict.h.

192  {
193  return dawg_permute_and_select(char_choices, rating_limit,
194  kAnyWordLength, 0);
195  }
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int end_char_choice_index)
Definition: permdawg.cpp:263
void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word for debugging; the best raw choice and the cooked choices are written to stderr via PrintViableChoice.

Definition at line 330 of file stopper.cpp.

330  {
331  LIST Choices;
332  int i;
333  char LabelString[80];
334  VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(best_choices_);
335  bool force_debug =
336  fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments;
337 
338  if (stopper_debug_level >= 1 || force_debug ||
339  (((STRING)word_to_debug).length() > 0 && best_choices_ &&
340  StringSameAs(word_to_debug.string(), word_to_debug_lengths.string(),
341  (VIABLE_CHOICE)first_node(best_choices_)))) {
342  if (best_raw_choice_)
343  PrintViableChoice(stderr, "\nBest Raw Choice: ", best_raw_choice_);
344 
345  i = 1;
346  Choices = best_choices_;
347  if (Choices)
348  cprintf("\nBest Cooked Choices:\n");
349  iterate(Choices) {
350  sprintf(LabelString, "Cooked Choice #%d: ", i);
351  PrintViableChoice(stderr, LabelString,
352  (VIABLE_CHOICE)first_node(Choices));
353  i++;
354  }
355  }
356 }
#define NULL
Definition: host.h:144
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice)
Dumps a text representation of the specified Choice to File.
Definition: stopper.cpp:912
VIABLE_CHOICE_STRUCT * VIABLE_CHOICE
Definition: stopper.h:86
bool StringSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
Definition: stopper.cpp:951
bool ComposedFromCharFragments
Definition: stopper.h:73
char * word_to_debug_lengths
Definition: dict.h:869
const char * string() const
Definition: strngs.cpp:156
int stopper_debug_level
Definition: dict.h:856
Definition: strngs.h:40
char * word_to_debug
Definition: dict.h:867
int fragments_debug
Definition: dict.h:870
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain two relevant input vectors: active_dawgs and constraints. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. Each entry in the constraints vector contains an index into the dawgs_ vector and an EDGE_REF that indicates an edge in a pattern dawg followed to match a pattern. Currently constraints are used to save the state of punctuation dawgs after leading punctuation was found.

Input: At word_index 0 dawg_args->active_dawgs should contain an entry for each dawg whose type has a bit set in kBeginningDawgsType, and dawg_args->constraints should be empty. EDGE_REFs in the active_dawgs and constraints vectors should be initialized to NO_EDGE. If hyphen state needs to be applied, the initial dawg_args->active_dawgs and dawg_args->constraints can be copied from the saved hyphen state (maintained by Dict). For word_index > 0 the corresponding state (active_dawgs and constraints) can be obtained from dawg_args->updated_* passed to def_letter_is_okay for word_index-1. Note: the function assumes that the active_dawgs, constraints and updated_* member variables of dawg_args are not NULL.

Output: The function fills in dawg_args->updated_active_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index. The new constraints (if any) are added to dawg_args->updated_constraints, the constraints from dawg_args->constraints are also copied into it.

Detailed description: In order to determine whether the word is still valid after considering all the letters up to the one at word_index the following is done for each entry in dawg_args->active_dawgs:

  • next starting node is obtained from entry.ref and edge_char_of() is called to obtain the next edge
  • if a valid edge is found, the permuter code is updated and an entry [entry.dawg_index, edge] is inserted in dawg_args->updated_active_dawgs; otherwise:
    • if we are dealing with a dawg of type DAWG_TYPE_PUNCTUATION, edge_char_of() is called again, but now with kPatternUnicharID as unichar_id; if a valid edge is found it is recorded in dawg_args->updated_constraints
    • the function checks whether the word can end with the previous letter
    • each successor of the dawg (e.g. dawgs with type DAWG_TYPE_WORD could be successors to dawgs with type DAWG_TYPE_PUNCTUATION; the successors are defined by successors_ vector) is explored and if a letter is found in the successor dawg, a new entry is inserted into dawg_args->updated_active_dawgs with EDGE_REF being either NO_EDGE or an EDGE_REF recorded in constraints vector for the corresponding dawg index

Definition at line 380 of file dict.cpp.

382  {
383  DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
384 
385  if (dawg_debug_level >= 3) {
386  tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
387  " num active dawgs=%d num constraints=%d\n",
388  getUnicharset().debug_str(unichar_id).string(), word_end,
389  dawg_args->active_dawgs->length(),
390  dawg_args->constraints->length());
391  }
392 
393  // Do not accept words that contain kPatternUnicharID.
394  // (otherwise pattern dawgs would not function correctly).
395  // Do not accept words containing INVALID_UNICHAR_IDs.
396  if (unichar_id == Dawg::kPatternUnicharID ||
397  unichar_id == INVALID_UNICHAR_ID) {
398  dawg_args->permuter = NO_PERM;
399  return NO_PERM;
400  }
401 
402  // Initialization.
403  PermuterType curr_perm = NO_PERM;
404  dawg_args->updated_active_dawgs->clear();
405  const DawgInfoVector &constraints = *(dawg_args->constraints);
406  *dawg_args->updated_constraints = constraints;
407 
408  // Go over the active_dawgs vector and insert DawgInfo records with the
409  // updated ref (an edge with the corresponding unichar id) into
410  // dawg_args->updated_active_dawgs.
411  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
412  const DawgInfo &info = (*dawg_args->active_dawgs)[a];
413  const Dawg *dawg = dawgs_[info.dawg_index];
414  // dawg_unichar_id will contain the literal unichar_id to be found in the
415  // dawgs (e.g. digit pattern if unichar_id is a digit and dawg contains
416  // number patterns, word pattern if dawg is a punctuation dawg and we
417  // reached an end of beginning punctuation pattern, etc).
418  UNICHAR_ID dawg_unichar_id = unichar_id;
419 
420  // If we are dealing with the pattern dawg, look up all the
421  // possible edges, not only for the exact unichar_id, but also
422  // for all its character classes (alpha, digit, etc).
423  if (dawg->type() == DAWG_TYPE_PATTERN) {
424  ProcessPatternEdges(dawg, info, dawg_unichar_id, word_end,
425  dawg_args, &curr_perm);
426  // There can't be any successors to dawg that is of type
427  // DAWG_TYPE_PATTERN, so we are done examining this DawgInfo.
428  continue;
429  }
430 
431  // The number dawg generalizes all digits to be kPatternUnicharID,
432  // so try to match kPatternUnicharID if the current unichar is a digit.
433  if (dawg->type() == DAWG_TYPE_NUMBER &&
434  getUnicharset().get_isdigit(dawg_unichar_id)) {
435  dawg_unichar_id = Dawg::kPatternUnicharID;
436  }
437 
438  // Find the edge out of the node for the dawg_unichar_id.
439  NODE_REF node = GetStartingNode(dawg, info.ref);
440  EDGE_REF edge = (node != NO_EDGE) ?
441  dawg->edge_char_of(node, dawg_unichar_id, word_end) : NO_EDGE;
442 
443  if (dawg_debug_level >= 3) {
444  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
445  info.dawg_index, node, edge);
446  }
447 
448  if (edge != NO_EDGE) { // the unichar was found in the current dawg
449  if (ConstraintsOk(*(dawg_args->updated_constraints),
450  word_end, dawg->type())) {
451  if (dawg_debug_level >=3) {
452  tprintf("Letter found in dawg %d\n", info.dawg_index);
453  }
454  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
455  dawg_args->updated_active_dawgs->add_unique(
456  DawgInfo(info.dawg_index, edge), dawg_debug_level > 0,
457  "Append current dawg to updated active dawgs: ");
458  }
459  } else if (dawg_args->sought_word_length == kAnyWordLength) {
460  // The unichar was not found in the current dawg.
461  // Explore the successor dawgs (but only if we are not
462  // just searching one dawg with a fixed word length).
463 
464  // Handle leading/trailing punctuation dawgs that denote a word pattern
465  // as an edge with kPatternUnicharID. If such an edge is found we add a
466  // constraint denoting the state of the dawg before the word pattern.
467  // This constraint will be applied later when this dawg is found among
468  // successor dawgs as well potentially at the end of the word.
469  if (dawg->type() == DAWG_TYPE_PUNCTUATION) {
470  edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end);
471  if (edge != NO_EDGE) {
472  dawg_args->updated_constraints->add_unique(
473  DawgInfo(info.dawg_index, edge), dawg_debug_level > 0,
474  "Recording constraint: ");
475  } else {
476  // Do not explore successors of this dawg, since this
477  // must be invalid leading or trailing punctuation.
478  if (dawg_debug_level >= 3) {
479  tprintf("Invalid punctuation from dawg %d\n", info.dawg_index);
480  }
481  continue;
482  }
483  }
484 
485  if (info.ref == NO_EDGE) {
486  if (dawg_debug_level >= 3) {
487  tprintf("No letters matched in dawg %d\n", info.dawg_index);
488  }
489  continue;
490  }
491 
492  // Discard the dawg if the pattern can not end at previous letter.
493  if (edge == NO_EDGE && // previous part is not leading punctuation
494  !dawg->end_of_word(info.ref)) {
495  if (dawg_debug_level >= 3) {
496  tprintf("No valid pattern end in dawg %d\n", info.dawg_index);
497  }
498  continue;
499  }
500 
501  // Look for the unichar in each of this dawg's successors
502  // and append those in which it is found to active_dawgs.
503  const SuccessorList &slist = *(successors_[info.dawg_index]);
504  for (int s = 0; s < slist.length(); ++s) {
505  int sdawg_index = slist[s];
506  const Dawg *sdawg = dawgs_[sdawg_index];
507  NODE_REF snode = 0;
508  // Apply constraints to the successor dawg.
509  for (int c = 0; c < constraints.length(); ++c) {
510  // If the successor dawg is described in the constraints change
511  // the start ref from 0 to the one recorded as the constraint.
512  const DawgInfo &cinfo = constraints[c];
513  if (cinfo.dawg_index == sdawg_index) {
514  snode = sdawg->next_node(cinfo.ref);
515  // Make sure we do not search the successor dawg if after
516  // applying the saved constraint we are at the end of the word.
517  if (snode == 0) snode = NO_EDGE;
518  if (dawg_debug_level >= 3) {
519  tprintf("Applying constraint [%d, " REFFORMAT "]\n",
520  sdawg_index, snode);
521  }
522  }
523  }
524  // Look for the letter in this successor dawg.
525  EDGE_REF sedge = sdawg->edge_char_of(snode, unichar_id, word_end);
526  // If we found the letter append sdawg to the active_dawgs list.
527  if (sedge != NO_EDGE &&
528  ConstraintsOk(*(dawg_args->updated_constraints), word_end,
529  dawgs_[sdawg_index]->type())) {
530  if (dawg_debug_level >= 3) {
531  tprintf("Letter found in the successor dawg %d\n", sdawg_index);
532  }
533  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
534  if (sdawg->next_node(sedge) != 0) { // if not word end
535  dawg_args->updated_active_dawgs->add_unique(
536  DawgInfo(sdawg_index, sedge), dawg_debug_level > 0,
537  "Append successor to updated active dawgs: ");
538  }
539  }
540  } // end successors loop
541  } // end if/else
542  } // end for
543  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
544  // or if we found the current letter in a non-punctuation dawg. This
545  // allows preserving information on which dawg the "core" word came from.
546  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
547  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
548  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
549  dawg_args->permuter = curr_perm;
550  }
551  return dawg_args->permuter;
552 }
#define REFFORMAT
Definition: dawg.h:92
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:620
int UNICHAR_ID
Definition: unichar.h:31
void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:554
inT64 EDGE_REF
Definition: dawg.h:54
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
bool ConstraintsOk(const DawgInfoVector &constraints, int word_end, DawgType current_dawg_type) const
Definition: dict.h:631
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:129
int dawg_debug_level
Definition: dict.h:839
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT64 NODE_REF
Definition: dawg.h:55
GenericVector< int > SuccessorList
Definition: dawg.h:68
double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 587 of file dict.h.

589  {
590  (void) context;
591  (void) context_bytes;
592  (void) character;
593  (void) character_bytes;
594  return 0.0;
595  }
void tesseract::Dict::DisableChoiceAccum ( )
inline

Definition at line 344 of file dict.h.

344 { keep_word_choices_ = false; }
void tesseract::Dict::EnableChoiceAccum ( )
inline

Definition at line 345 of file dict.h.

345 { keep_word_choices_ = true; }
void tesseract::Dict::End ( )

Definition at line 335 of file dict.cpp.

335  {
336  if (dawgs_.length() == 0)
337  return; // Not safe to call twice.
338  dawgs_.delete_data_pointers();
339  successors_.delete_data_pointers();
340  dawgs_.clear();
341  delete bigram_dawg_;
342  successors_.clear();
343  document_words_ = NULL;
344  max_fixed_length_dawgs_wdlen_ = -1;
345  if (pending_words_ != NULL) {
346  delete pending_words_;
347  pending_words_ = NULL;
348  }
349 }
void delete_data_pointers()
virtual void clear()
#define NULL
Definition: host.h:144
int length() const
Definition: genericvector.h:63
void tesseract::Dict::end_permute ( )
void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 778 of file stopper.cpp.

778 {}
void tesseract::Dict::FillViableChoice ( const WERD_CHOICE WordChoice,
FLOAT32  AdjustFactor,
const float  Certainties[],
VIABLE_CHOICE  ViableChoice 
)

Fill ViableChoice with information from WordChoice, the current segmentation (current_segmentation_), AdjustFactor, and Certainties.

Definition at line 943 of file stopper.cpp.

945  {
946  ViableChoice->Init(WordChoice, current_segmentation_, Certainties,
947  AdjustFactor);
948 
949 }
void Init(const WERD_CHOICE &word_choice, const PIECES_STATE &pieces_state, const float certainties[], FLOAT32 adjust_factor)
Definition: stopper.cpp:120
void tesseract::Dict::FilterWordChoices ( )

Removes from best_choices_ all choices which are not within a reasonable range of the best choice.

Definition at line 375 of file stopper.cpp.

375  {
376  EXPANDED_CHOICE BestChoice;
377 
378  if (best_choices_ == NIL_LIST || second_node (best_choices_) == NIL_LIST)
379  return;
380 
381  // Compute certainties and class for each chunk in best choice.
382  VIABLE_CHOICE_STRUCT *best_choice =
383  (VIABLE_CHOICE_STRUCT *)first_node(best_choices_);
384  ExpandChoice(best_choice, &BestChoice);
385  if (stopper_debug_level >= 2)
386  PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice);
389  set_rest(best_choices_, delete_d(list_rest(best_choices_),
390  &BestChoice, is_bad));
391  delete is_bad;
392 }
#define NIL_LIST
Definition: oldlist.h:126
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:125
#define set_rest(l, cell)
Definition: oldlist.h:222
#define list_rest(l)
Definition: oldlist.h:138
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice)
Dumps a text representation of the specified Choice to File.
Definition: stopper.cpp:912
#define second_node(l)
Definition: oldlist.h:211
int FreeBadChoice(void *item1, void *item2)
Definition: stopper.cpp:167
int stopper_debug_level
Definition: dict.h:856
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
#define first_node(l)
Definition: oldlist.h:139
void tesseract::Dict::FindClassifierErrors ( FLOAT32  MinRating,
FLOAT32  MaxRating,
FLOAT32  RatingMargin,
FLOAT32  Thresholds[] 
)

Compares the best choice for the current word to the best raw choice to determine which characters were classified incorrectly by the classifier. Then places a separate threshold into Thresholds for each character in the word. If the classifier was correct, MaxRating is placed into Thresholds. If the classifier was incorrect, the average match rating (error percentage) of the classifier's incorrect choice minus some margin is placed into Thresholds. This can then be used by the caller to try to create a new template for the desired class that will classify the character with a rating better than the threshold value. The match rating placed into Thresholds is never allowed to be below MinRating in order to prevent trying to make overly tight templates. MinRating limits how tight to make a template. MaxRating limits how loose to make a template. RatingMargin denotes the amount of margin to put in the template.

Definition at line 394 of file stopper.cpp.

397  {
398  EXPANDED_CHOICE BestRaw;
399  VIABLE_CHOICE Choice;
400  int i, j, Chunk;
401  FLOAT32 AvgRating;
402  int NumErrorChunks;
403 
404  assert (best_choices_ != NIL_LIST);
405  assert (best_raw_choice_ != NULL);
406 
407  ExpandChoice(best_raw_choice_, &BestRaw);
408  Choice = (VIABLE_CHOICE) first_node (best_choices_);
409 
410  for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
411  AvgRating = 0.0;
412  NumErrorChunks = 0;
413 
414  for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
415  if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
416  AvgRating += BestRaw.ChunkCertainty[Chunk];
417  NumErrorChunks++;
418  }
419  }
420 
421  if (NumErrorChunks > 0) {
422  AvgRating /= NumErrorChunks;
423  *Thresholds = (AvgRating / -certainty_scale) * (1.0 - RatingMargin);
424  }
425  else
426  *Thresholds = MaxRating;
427 
428  if (*Thresholds > MaxRating)
429  *Thresholds = MaxRating;
430  if (*Thresholds < MinRating)
431  *Thresholds = MinRating;
432  }
433 }
CHAR_CHOICE * Blob
Definition: stopper.h:74
UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS]
Definition: stopper.cpp:56
#define NIL_LIST
Definition: oldlist.h:126
UNICHAR_ID Class
Definition: stopper.h:51
#define NULL
Definition: host.h:144
double certainty_scale
Definition: dict.h:845
VIABLE_CHOICE_STRUCT * VIABLE_CHOICE
Definition: stopper.h:86
float FLOAT32
Definition: host.h:111
uinT16 NumChunks
Definition: stopper.h:52
float ChunkCertainty[MAX_NUM_CHUNKS]
Definition: stopper.cpp:55
#define first_node(l)
Definition: oldlist.h:139
bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO char_frag_info 
)

Semi-generic functions used by multiple permuters.

Definition at line 1283 of file permute.cpp.

1287  {
1288  const CHAR_FRAGMENT *this_fragment =
1289  getUnicharset().get_fragment(curr_unichar_id);
1290  const CHAR_FRAGMENT *prev_fragment =
1291  prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
1292 
1293  // Print debug info for fragments.
1294  if (debug && (prev_fragment || this_fragment)) {
1295  cprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
1296  getUnicharset().debug_str(curr_unichar_id).string(),
1297  word_ending);
1298  if (prev_fragment) {
1299  cprintf("prev_fragment %s\n", prev_fragment->to_string().string());
1300  }
1301  if (this_fragment) {
1302  cprintf("this_fragment %s\n", this_fragment->to_string().string());
1303  }
1304  }
1305 
1306  char_frag_info->unichar_id = curr_unichar_id;
1307  char_frag_info->fragment = this_fragment;
1308  char_frag_info->rating = curr_rating;
1309  char_frag_info->certainty = curr_certainty;
1310  char_frag_info->num_fragments = 1;
1311  if (prev_fragment && !this_fragment) {
1312  if (debug) tprintf("Skip choice with incomplete fragment\n");
1313  return false;
1314  }
1315  if (this_fragment) {
1316  // We are dealing with a fragment.
1317  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
1318  if (prev_fragment) {
1319  if (!this_fragment->is_continuation_of(prev_fragment)) {
1320  if (debug) tprintf("Non-matching fragment piece\n");
1321  return false;
1322  }
1323  if (this_fragment->is_ending()) {
1324  char_frag_info->unichar_id =
1325  getUnicharset().unichar_to_id(this_fragment->get_unichar());
1326  char_frag_info->fragment = NULL;
1327  if (debug) {
1328  tprintf("Built character %s from fragments\n",
1329  getUnicharset().debug_str(
1330  char_frag_info->unichar_id).string());
1331  }
1332  } else {
1333  if (debug) tprintf("Record fragment continuation\n");
1334  char_frag_info->fragment = this_fragment;
1335  }
1336  // Update certainty and rating.
1337  char_frag_info->rating =
1338  prev_char_frag_info->rating + curr_rating;
1339  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
1340  char_frag_info->certainty =
1341  MIN(curr_certainty, prev_char_frag_info->certainty);
1342  } else {
1343  if (this_fragment->is_beginning()) {
1344  if (debug) cprintf("Record fragment beginning\n");
1345  } else {
1346  if (debug) {
1347  tprintf("Non-starting fragment piece with no prev_fragment\n");
1348  }
1349  return false;
1350  }
1351  }
1352  }
1353  if (word_ending && char_frag_info->fragment) {
1354  if (debug) tprintf("Word can not end with a fragment\n");
1355  return false;
1356  }
1357  return true;
1358 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
int num_fragments
Definition: dict.h:40
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
float rating
Definition: dict.h:41
UNICHAR_ID unichar_id
Definition: dict.h:38
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:80
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
const char * string() const
Definition: strngs.cpp:156
bool is_ending() const
Definition: unicharset.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define MIN(x, y)
Definition: ndminx.h:28
float certainty
Definition: dict.h:42
const char * get_unichar() const
Definition: unicharset.h:52
bool is_beginning() const
Definition: unicharset.h:87
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
const CHAR_FRAGMENT * fragment
Definition: dict.h:39
static STRING to_string(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.cpp:889
int tesseract::Dict::FreeBadChoice ( void *  item1,
void *  item2 
)

Definition at line 167 of file stopper.cpp.

169  { // EXPANDED_CHOICE *BestChoice
170  int i, j, Chunk;
171  FLOAT32 Threshold;
172  VIABLE_CHOICE Choice = reinterpret_cast<VIABLE_CHOICE>(item1);
173  EXPANDED_CHOICE *BestChoice = reinterpret_cast<EXPANDED_CHOICE *>(item2);
174  Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor,
175  Choice->AdjustFactor);
176  for (i = 0, Chunk = 0; i < Choice->Length; i++) {
177  for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
178  if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
179  Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
180  Threshold) {
181  if (stopper_debug_level >= 2)
182  PrintViableChoice(stderr, "\nDiscarding bad choice: ", Choice);
183  delete Choice;
184  return true;
185  }
186  }
187  }
188  return false;
189 }
double StopperAmbigThreshold(double f1, double f2)
Definition: dict.h:322
CHAR_CHOICE * Blob
Definition: stopper.h:74
UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS]
Definition: stopper.cpp:56
UNICHAR_ID Class
Definition: stopper.h:51
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice)
Dumps a text representation of the specified Choice to File.
Definition: stopper.cpp:912
float FLOAT32
Definition: host.h:111
uinT16 NumChunks
Definition: stopper.h:52
int stopper_debug_level
Definition: dict.h:856
VIABLE_CHOICE Choice
Definition: stopper.cpp:54
float ChunkCertainty[MAX_NUM_CHUNKS]
Definition: stopper.cpp:55
FLOAT32 AdjustFactor
Definition: stopper.h:72
float Certainty
Definition: stopper.h:53
WERD_CHOICE * tesseract::Dict::get_top_choice_word ( const BLOB_CHOICE_LIST_VECTOR char_choices)

Return the top choice for each character as the choice for the word.

Definition at line 908 of file permute.cpp.

909  {
911  float certainties[MAX_PERM_LENGTH];
912  top_word->set_permuter(TOP_CHOICE_PERM);
913  for (int x = 0; x < char_choices.length(); x++) {
914  BLOB_CHOICE_IT blob_choice_it;
915  blob_choice_it.set_to_list(char_choices.get(x));
916  BLOB_CHOICE *top_choice = blob_choice_it.data();
917  top_word->append_unichar_id_space_allocated(top_choice->unichar_id(), 1,
918  top_choice->rating(),
919  top_choice->certainty());
920  certainties[x] = top_choice->certainty();
921  }
922  LogNewChoice(1.0, certainties, true, top_word, char_choices);
923  return top_word;
924 }
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
float certainty() const
Definition: ratngs.h:65
#define MAX_PERM_LENGTH
Definition: permute.h:36
int length() const
Definition: genericvector.h:63
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.h:331
void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)
Definition: stopper.cpp:484
void set_permuter(uinT8 perm)
Definition: ratngs.h:261
float rating() const
Definition: ratngs.h:62
int tesseract::Dict::get_top_word_script ( const BLOB_CHOICE_LIST_VECTOR char_choices,
const UNICHARSET unicharset 
)

Definition at line 908 of file dict.cpp.

909  {
910  int max_script = unicharset.get_script_table_size();
911  int *sid = new int[max_script];
912  int x;
913  for (x = 0; x < max_script; x++) sid[x] = 0;
914  for (x = 0; x < char_choices.length(); ++x) {
915  BLOB_CHOICE_IT blob_choice_it(char_choices.get(x));
916  sid[blob_choice_it.data()->script_id()]++;
917  }
918  if (unicharset.han_sid() != unicharset.null_sid()) {
919  // Add the Hiragana & Katakana counts to Han and zero them out.
920  if (unicharset.hiragana_sid() != unicharset.null_sid()) {
921  sid[unicharset.han_sid()] += sid[unicharset.hiragana_sid()];
922  sid[unicharset.hiragana_sid()] = 0;
923  }
924  if (unicharset.katakana_sid() != unicharset.null_sid()) {
925  sid[unicharset.han_sid()] += sid[unicharset.katakana_sid()];
926  sid[unicharset.katakana_sid()] = 0;
927  }
928  }
929  // Note that high script ID overrides lower one on a tie, thus biasing
930  // towards non-Common script (if sorted that way in unicharset file).
931  int max_sid = 0;
932  for (x = 1; x < max_script; x++)
933  if (sid[x] >= sid[max_sid]) max_sid = x;
934  if (sid[max_sid] < char_choices.length() / 2)
935  max_sid = unicharset.null_sid();
936  delete[] sid;
937  return max_sid;
938 }
int get_script_table_size() const
Definition: unicharset.h:718
T & get(int index) const
int null_sid() const
Definition: unicharset.h:752
int length() const
Definition: genericvector.h:63
int hiragana_sid() const
Definition: unicharset.h:758
int han_sid() const
Definition: unicharset.h:757
int katakana_sid() const
Definition: unicharset.h:759
const LIST& tesseract::Dict::getBestChoices ( )
inline

Definition at line 730 of file dict.h.

730 { return best_choices_; }
const Dawg* tesseract::Dict::GetDawg ( int  index) const
inline

Return i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 605 of file dict.h.

605 { return dawgs_[index]; }
const Dawg* tesseract::Dict::GetFixedLengthDawg ( int  word_length) const
inline

Return the pointer to the Dawg that contains words of length word_length.

Definition at line 611 of file dict.h.

611  {
612  if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
613  assert(dawgs_.size() > word_length);
614  return dawgs_[word_length];
615  }
#define NULL
Definition: host.h:144
int size() const
Definition: genericvector.h:59
const Image* tesseract::Dict::getImage ( ) const
inline

Definition at line 94 of file dict.h.

94  {
95  return image_ptr_;
96  }
Image* tesseract::Dict::getImage ( )
inline

Definition at line 97 of file dict.h.

97  {
98  return image_ptr_;
99  }
const int tesseract::Dict::GetMaxFixedLengthDawgIndex ( ) const
inline

Definition at line 616 of file dict.h.

616  {
617  return max_fixed_length_dawgs_wdlen_;
618  }
const Dawg* tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 607 of file dict.h.

607 { return punc_dawg_; }
static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg dawg,
EDGE_REF  edge_ref 
)
inlinestatic

Returns the appropriate next node given the EDGE_REF.

Definition at line 620 of file dict.h.

620  {
621  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
622  NODE_REF node = dawg->next_node(edge_ref);
623  if (node == 0) node = NO_EDGE; // end of word
624  return node;
625  }
inT64 NODE_REF
Definition: dawg.h:55
const Dawg* tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the pointer to the unambiguous words dawg.

Definition at line 609 of file dict.h.

609 { return unambig_dawg_; }
const UnicharAmbigs& tesseract::Dict::getUnicharAmbigs ( )
inline

Definition at line 106 of file dict.h.

106  {
107  return getImage()->getCCUtil()->unichar_ambigs;
108  }
const CCUtil * getCCUtil() const
Definition: image.h:29
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:73
const Image * getImage() const
Definition: dict.h:94
const UNICHARSET& tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 100 of file dict.h.

100  {
101  return getImage()->getCCUtil()->unicharset;
102  }
const CCUtil * getCCUtil() const
Definition: image.h:29
UNICHARSET unicharset
Definition: ccutil.h:72
const Image * getImage() const
Definition: dict.h:94
UNICHARSET& tesseract::Dict::getUnicharset ( )
inline

Definition at line 103 of file dict.h.

103  {
104  return getImage()->getCCUtil()->unicharset;
105  }
const CCUtil * getCCUtil() const
Definition: image.h:29
UNICHARSET unicharset
Definition: ccutil.h:72
const Image * getImage() const
Definition: dict.h:94
void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
bool  word_ending,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further. Also: (1) sets the hyphen word if needed; (2) if word_ending is true and the word is better than best_choice, copies word to best_choice and logs the new word choice.

Definition at line 67 of file permdawg.cpp.

71  {
72  DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
73  word_ending = (char_choice_index == more_args->end_char_choice_index);
74  int word_index = word->length() - 1;
75 
76  if (ambigs_mode(*limit)) {
77  if (best_choice->rating() < *limit) return;
78  } else {
79  // Prune bad subwords
80  if (more_args->rating_array[word_index] == NO_RATING) {
81  more_args->rating_array[word_index] = word->rating();
82  } else {
83  float permdawg_limit = more_args->rating_array[word_index] *
84  more_args->rating_margin + kPermDawgRatingPad;
85  if (permdawg_limit < word->rating()) {
87  tprintf("early pruned word rating=%4.2f,"
88  " permdawg_limit=%4.2f, word=%s\n", word->rating(),
89  permdawg_limit, word->debug_string().string());
90  }
91  return;
92  }
93  }
94  }
95  // Deal with hyphens
96  if (word_ending && more_args->sought_word_length == kAnyWordLength &&
97  has_hyphen_end(*word) && !ambigs_mode(*limit)) {
98  // Copy more_args->active_dawgs to clean_active_dawgs removing
99  // dawgs of type DAWG_TYPE_PATTERN.
100  DawgInfoVector clean_active_dawgs;
101  const DawgInfoVector &active_dawgs = *(more_args->active_dawgs);
102  for (int i = 0; i < active_dawgs.size(); ++i) {
103  if (dawgs_[active_dawgs[i].dawg_index]->type() != DAWG_TYPE_PATTERN) {
104  clean_active_dawgs += active_dawgs[i];
105  }
106  }
107  if (clean_active_dawgs.size() > 0) {
109  tprintf("new hyphen choice = %s\n", word->debug_string().string());
110  word->set_permuter(more_args->permuter);
111  adjust_word(word, certainties, &char_choices, permute_debug);
112  set_hyphen_word(*word, *(more_args->active_dawgs),
113  *(more_args->constraints));
114  update_best_choice(*word, best_choice);
115  }
116  } else { // Look up char in DAWG
117  // TODO(daria): update the rest of the code that specifies alternative
118  // letter_is_okay_ functions (e.g. TessCharNgram class) to work with
119  // multi-byte unichars and/or unichar ids.
120 
121  // If the current unichar is an ngram first try calling
122  // letter_is_okay() for each unigram it contains separately.
123  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
124  bool checked_unigrams = false;
125  if (getUnicharset().get_isngram(orig_uch_id)) {
127  tprintf("checking unigrams in an ngram %s\n",
128  getUnicharset().debug_str(orig_uch_id).string());
129  }
130  int orig_num_fragments = word->fragment_length(word_index);
131  int num_unigrams = 0;
132  word->remove_last_unichar_id();
133  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
134  const char *ngram_str_end = ngram_str + strlen(ngram_str);
135  const char *ngram_ptr = ngram_str;
136  bool unigrams_ok = true;
137  // Construct DawgArgs that reflect the current state.
138  DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
139  DawgInfoVector unigram_constraints = *(more_args->constraints);
140  DawgInfoVector unigram_updated_active_dawgs;
141  DawgInfoVector unigram_updated_constraints;
142  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
143  &unigram_constraints,
144  &unigram_updated_active_dawgs,
145  &unigram_updated_constraints, 0.0,
146  more_args->permuter,
147  more_args->sought_word_length,
148  more_args->end_char_choice_index);
149  // Check unigrams in the ngram with letter_is_okay().
150  while (unigrams_ok && ngram_ptr < ngram_str_end) {
151  int step = getUnicharset().step(ngram_ptr);
152  UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
153  getUnicharset().unichar_to_id(ngram_ptr, step);
154  ngram_ptr += step;
155  ++num_unigrams;
156  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
157  unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
158  &unigram_dawg_args,
159  word->unichar_id(word_index+num_unigrams-1),
160  word_ending && (ngram_ptr == ngram_str_end));
161  (*unigram_dawg_args.active_dawgs) =
162  *(unigram_dawg_args.updated_active_dawgs);
163  (*unigram_dawg_args.constraints) =
164  *(unigram_dawg_args.updated_constraints);
166  tprintf("unigram %s is %s\n",
167  getUnicharset().debug_str(uch_id).string(),
168  unigrams_ok ? "OK" : "not OK");
169  }
170  }
171  // Restore the word and copy the updated dawg state if needed.
172  while (num_unigrams-- > 0) word->remove_last_unichar_id();
174  orig_uch_id, orig_num_fragments, 0.0, 0.0);
175  if (unigrams_ok) {
176  checked_unigrams = true;
177  more_args->permuter = unigram_dawg_args.permuter;
178  *(more_args->updated_active_dawgs) =
179  *(unigram_dawg_args.updated_active_dawgs);
180  *(more_args->updated_constraints) =
181  *(unigram_dawg_args.updated_constraints);
182  }
183  }
184 
185  // Check which dawgs from the dawgs_ vector contain the word
186  // up to and including the current unichar.
187  if (checked_unigrams || (this->*letter_is_okay_)(
188  more_args, word->unichar_id(word_index), word_ending)) {
189  // Add a new word choice
190  if (word_ending) {
192  tprintf("found word = %s\n", word->debug_string().string());
193  }
194  if (ambigs_mode(*limit) &&
195  strcmp(output_ambig_words_file.string(), "") != 0) {
196  if (output_ambig_words_file_ == NULL) {
197  output_ambig_words_file_ =
198  fopen(output_ambig_words_file.string(), "wb+");
199  if (output_ambig_words_file_ == NULL) {
200  tprintf("Failed to open output_ambig_words_file %s\n",
201  output_ambig_words_file.string());
202  exit(1);
203  }
204  }
205  STRING word_str;
206  word->string_and_lengths(&word_str, NULL);
207  word_str += " ";
208  fprintf(output_ambig_words_file_, word_str.string());
209  }
210  WERD_CHOICE *adjusted_word = word;
211  WERD_CHOICE hyphen_tail_word(&getUnicharset());
212  if (hyphen_base_size() > 0) {
213  hyphen_tail_word = *word;
214  remove_hyphen_head(&hyphen_tail_word);
215  adjusted_word = &hyphen_tail_word;
216  }
217  adjusted_word->set_permuter(more_args->permuter);
218  if (!ambigs_mode(*limit)) {
219  adjust_word(adjusted_word, &certainties[hyphen_base_size()],
220  &char_choices, permute_debug);
221  }
222  update_best_choice(*adjusted_word, best_choice);
223  } else { // search the next letter
224  // Make updated_* point to the next entries in the DawgInfoVector
225  // arrays (that were originally created in dawg_permute_and_select)
226  ++(more_args->updated_active_dawgs);
227  ++(more_args->updated_constraints);
228  // Make active_dawgs and constraints point to the updated ones.
229  ++(more_args->active_dawgs);
230  ++(more_args->constraints);
231  permute_choices(debug, char_choices, char_choice_index + 1,
232  prev_char_frag_info, word, certainties, limit,
233  best_choice, attempts_left, more_args);
234  // Restore previous state to explore another letter in this position.
235  --(more_args->updated_active_dawgs);
236  --(more_args->updated_constraints);
237  --(more_args->active_dawgs);
238  --(more_args->constraints);
239  }
240  } else {
242  tprintf("last unichar not OK at index %d in %s\n",
243  word_index, word->debug_string().string());
244  }
245  }
246  }
247 }
int length() const
Definition: ratngs.h:214
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
const STRING debug_string() const
Definition: ratngs.h:373
bool ambigs_mode(float rating_limit)
Returns true if we are operating in ambigs mode.
Definition: dict.h:180
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.cpp:313
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool permute_debug
Definition: dict.h:872
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permute.cpp:1437
char * output_ambig_words_file
Definition: dict.h:837
int dawg_debug_level
Definition: dict.h:839
void adjust_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool nonword, float additional_adjust, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:749
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void remove_last_unichar_id()
Definition: ratngs.h:356
const char fragment_length(int index) const
Definition: ratngs.h:227
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:560
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
Definition: strngs.h:40
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:166
void remove_hyphen_head(WERD_CHOICE *word) const
Definition: dict.h:137
int step(const char *str) const
Definition: unicharset.cpp:192
void set_hyphen_word(const WERD_CHOICE &word, const DawgInfoVector &active_dawgs, const DawgInfoVector &constraints)
Definition: hyphen.cpp:50
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:294
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.h:331
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:122
#define NO_RATING
Definition: dict.h:34
float rating() const
Definition: ratngs.h:231
void set_permuter(uinT8 perm)
Definition: ratngs.h:261
void tesseract::Dict::go_deeper_top_fragments_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
bool  word_ending,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

While the choice being composed so far could be better than best_choice keeps exploring char_choices. If the end of the word is reached and the word is better than best_choice, copies word to best_choice and logs the new word choice.

go_deeper_top_fragments_fxn

While the choice being composed so far could be better than best_choice keeps exploring char_choices. If the end of the word is reached and the word is better than best_choice, copies word to best_choice and logs the new word choice.

Definition at line 1538 of file permute.cpp.

1542  {
1543  if (word->rating() < *limit) {
1544  if (word_ending) {
1545  if (fragments_debug > 1) {
1546  tprintf("fragments_debug new choice = %s\n",
1547  word->debug_string().string());
1548  }
1549  *limit = word->rating();
1550  adjust_non_word(word, certainties, &char_choices, permute_debug);
1551  update_best_choice(*word, best_choice);
1552  } else { // search the next letter
1553  permute_choices(debug, char_choices, char_choice_index + 1,
1554  prev_char_frag_info, word, certainties, limit,
1555  best_choice, attempts_left, more_args);
1556  }
1557  } else {
1558  if (fragments_debug > 1) {
1559  tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
1560  word->debug_string().string(), word->rating(), *limit);
1561  }
1562  }
1563 }
const STRING debug_string() const
Definition: ratngs.h:373
bool permute_debug
Definition: dict.h:872
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permute.cpp:1437
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void adjust_non_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool debug)
Definition: dict.h:719
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:166
int fragments_debug
Definition: dict.h:870
float rating() const
Definition: ratngs.h:231
int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

bool tesseract::Dict::has_hyphen_end ( UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 144 of file dict.h.

144  {
145  return (last_word_on_line_ && !first_pos &&
146  unichar_id == hyphen_unichar_id_);
147  }
bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 149 of file dict.h.

149  {
150  int word_index = word.length() - 1;
151  return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
152  }
int length() const
Definition: ratngs.h:214
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 122 of file dict.h.

122  {
123  return this->hyphenated() ? hyphen_word_->length() : 0;
124  }
int length() const
Definition: ratngs.h:214
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:118
bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 118 of file dict.h.

118  { return
119  !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
120  }
const int GetMaxFixedLengthDawgIndex() const
Definition: dict.h:616
void tesseract::Dict::incorporate_segcost ( WERD_CHOICE word)

Incorporate segmentation cost into word rating.

Incorporate segmentation cost into the word rating. This is done through a multiplier wordseg_rating_adjust_factor_ which is determined in bestfirst.cpp during state evaluation. This is not the cleanest way to do this. It would be better to reorganize the SEARCH_STATE to keep track of associated states, or do the rating adjustment outside the permuter in evaluate_state.

Definition at line 409 of file permute.cpp.

409  {
410  if (!word || wordseg_rating_adjust_factor_ <= 0) return;
411 
412  float old_rating = word->rating();
413  float new_rating = old_rating * wordseg_rating_adjust_factor_;
414  word->set_rating(new_rating);
415  if (permute_debug)
416  tprintf("Permute segadjust %f * %f --> %f\n",
417  old_rating, wordseg_rating_adjust_factor_, new_rating);
418 }
void set_rating(float new_val)
Definition: ratngs.h:255
bool permute_debug
Definition: dict.h:872
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
float rating() const
Definition: ratngs.h:231
void tesseract::Dict::init_active_dawgs ( int  sought_word_length,
DawgInfoVector active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 643 of file dict.cpp.

645  {
646  int i;
647  if (sought_word_length != kAnyWordLength) {
648  // Only search one fixed word length dawg.
649  if (sought_word_length <= max_fixed_length_dawgs_wdlen_ &&
650  dawgs_[sought_word_length] != NULL) {
651  *active_dawgs += DawgInfo(sought_word_length, NO_EDGE);
652  }
653  } else if (hyphenated()) {
654  *active_dawgs = hyphen_active_dawgs_;
655  if (dawg_debug_level >= 3) {
656  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
657  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
658  hyphen_active_dawgs_[i].dawg_index,
659  hyphen_active_dawgs_[i].ref);
660  }
661  }
662  } else {
663  for (i = 0; i < dawgs_.length(); ++i) {
664  if (dawgs_[i] != NULL && kBeginningDawgsType[(dawgs_[i])->type()] &&
665  !(ambigs_mode && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
666  *active_dawgs += DawgInfo(i, NO_EDGE);
667  if (dawg_debug_level >= 3) {
668  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
669  }
670  }
671  }
672  }
673 }
#define REFFORMAT
Definition: dawg.h:92
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:118
bool ambigs_mode(float rating_limit)
Returns true if we are operating in ambigs mode.
Definition: dict.h:180
#define NULL
Definition: host.h:144
int dawg_debug_level
Definition: dict.h:839
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int size() const
Definition: genericvector.h:59
int length() const
Definition: genericvector.h:63
void tesseract::Dict::init_constraints ( DawgInfoVector constraints) const

If hyphenated() returns true, copy the entries from hyphen_constraints_ into the given constraints vector.

Definition at line 677 of file dict.cpp.

677  {
678  if (hyphenated()) {
679  *constraints = hyphen_constraints_;
680  if (dawg_debug_level >= 3) {
681  for (int i = 0; i < hyphen_constraints_.size(); ++i) {
682  tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n",
683  hyphen_constraints_[i].dawg_index,
684  hyphen_constraints_[i].ref);
685  }
686  }
687  }
688 }
#define REFFORMAT
Definition: dawg.h:92
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:118
int dawg_debug_level
Definition: dict.h:839
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int size() const
Definition: genericvector.h:59
void tesseract::Dict::InitChoiceAccum ( )

Initializes the data structures used to keep track of the good word choices found for a word.

Definition at line 435 of file stopper.cpp.

435  {
436  BLOB_WIDTH *BlobWidth, *End;
437 
438  if (best_raw_choice_)
439  delete best_raw_choice_;
440  best_raw_choice_ = NULL;
441 
442  if (best_choices_)
443  destroy_nodes(best_choices_, DeleteViableChoiceStruct);
444  best_choices_ = NIL_LIST;
445 
446  if (raw_choices_)
448  raw_choices_ = NIL_LIST;
449 
451 
452  for (BlobWidth = current_segmentation_,
453  End = current_segmentation_ + MAX_NUM_CHUNKS;
454  BlobWidth < End; *BlobWidth++ = 1);
455 
456 }
void DeleteViableChoiceStruct(void *vcs)
Definition: stopper.cpp:59
#define NIL_LIST
Definition: oldlist.h:126
#define NULL
Definition: host.h:144
void EnableChoiceAccum()
Definition: dict.h:345
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
uinT8 BLOB_WIDTH
Definition: stopper.h:31
void End()
Definition: dict.cpp:335
#define MAX_NUM_CHUNKS
Definition: states.h:37
int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE WordChoice)

Returns the length of the shortest alpha run in WordChoice.

Definition at line 883 of file stopper.cpp.

883  {
884  int shortest = MAX_INT32;
885  int curr_len = 0;
886  for (int w = 0; w < WordChoice.length(); ++w) {
887  if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
888  curr_len++;
889  } else if (curr_len > 0) {
890  if (curr_len < shortest) shortest = curr_len;
891  curr_len = 0;
892  }
893  }
894  if (curr_len > 0 && curr_len < shortest) {
895  shortest = curr_len;
896  } else if (shortest == MAX_INT32) {
897  shortest = 0;
898  }
899  return shortest;
900 }
int length() const
Definition: ratngs.h:214
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
#define MAX_INT32
Definition: host.h:120
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 563 of file dict.h.

564  {
565  return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
566  }
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:560
void tesseract::Dict::Load ( )

Initialize Dict class - load dawgs from [lang].traineddata and the user-specified word list and pattern list.

Definition at line 219 of file dict.cpp.

219  {
220  STRING name;
221  STRING &lang = getImage()->getCCUtil()->lang;
222 
223  if (dawgs_.length() != 0) this->End();
224 
225  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
226 
229 
230  TessdataManager &tessdata_manager =
232 
233  // Load dawgs_.
234  if (load_punc_dawg && tessdata_manager.SeekToStart(TESSDATA_PUNC_DAWG)) {
235  punc_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
236  DAWG_TYPE_PUNCTUATION, lang, PUNC_PERM,
238  dawgs_ += punc_dawg_;
239  }
240  if (load_system_dawg && tessdata_manager.SeekToStart(TESSDATA_SYSTEM_DAWG)) {
241  dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
242  DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
244  }
245  if (load_number_dawg && tessdata_manager.SeekToStart(TESSDATA_NUMBER_DAWG)) {
246  dawgs_ +=
247  new SquishedDawg(tessdata_manager.GetDataFilePtr(),
248  DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level);
249  }
250  if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) {
251  bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
252  DAWG_TYPE_WORD, // doesn't actually matter.
253  lang,
254  COMPOUND_PERM, // doesn't actually matter.
256  }
257  if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
258  freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
259  DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM,
261  dawgs_ += freq_dawg_;
262  }
263  if (load_unambig_dawg &&
264  tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) {
265  unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
266  DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
268  dawgs_ += unambig_dawg_;
269  }
270 
271  if (((STRING &)user_words_suffix).length() > 0) {
272  Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
273  kMaxUserDawgEdges, getUnicharset().size(),
276  name += user_words_suffix;
277  if (!trie_ptr->read_word_list(name.string(), getUnicharset(),
279  tprintf("Error: failed to load %s\n", name.string());
280  exit(1);
281  }
282  dawgs_ += trie_ptr;
283  }
284 
285  if (((STRING &)user_patterns_suffix).length() > 0) {
286  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
287  kMaxUserDawgEdges, getUnicharset().size(),
289  trie_ptr->initialize_patterns(&(getUnicharset()));
291  name += user_patterns_suffix;
292  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
293  tprintf("Error: failed to load %s\n", name.string());
294  exit(1);
295  }
296  dawgs_ += trie_ptr;
297  }
298 
299  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
300  kMaxDocDawgEdges, getUnicharset().size(),
302  dawgs_ += document_words_;
303 
304  // This dawg is temporary and should not be searched by letter_is_ok.
305  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
306  kMaxDocDawgEdges, getUnicharset().size(),
308 
309  // Load fixed length dawgs if necessary (used for phrase search
310  // for non-space delimited languages).
312  tessdata_manager.SeekToStart(TESSDATA_FIXED_LENGTH_DAWGS)) {
313  ReadFixedLengthDawgs(DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
314  dawg_debug_level, tessdata_manager.GetDataFilePtr(),
315  &dawgs_, &max_fixed_length_dawgs_wdlen_);
316  }
317 
318  // Construct a list of corresponding successors for each dawg. Each entry i
319  // in the successors_ vector is a vector of integers that represent the
320  // indices into the dawgs_ vector of the successors for dawg i.
321  successors_.reserve(dawgs_.length());
322  for (int i = 0; i < dawgs_.length(); ++i) {
323  const Dawg *dawg = dawgs_[i];
324  SuccessorList *lst = new SuccessorList();
325  for (int j = 0; j < dawgs_.length(); ++j) {
326  const Dawg *other = dawgs_[j];
327  if (dawg != NULL && other != NULL &&
328  (dawg->lang() == other->lang()) &&
329  kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
330  }
331  successors_ += lst;
332  }
333 }
static void ReadFixedLengthDawgs(DawgType type, const STRING &lang, PermuterType perm, int debug_level, FILE *file, DawgVector *dawg_vec, int *max_wdlen)
Definition: dict.cpp:592
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
char * user_words_suffix
Definition: dict.h:799
bool load_bigram_dawg
Definition: dict.h:811
bool load_number_dawg
Definition: dict.h:807
bool load_unambig_dawg
Definition: dict.h:804
#define NULL
Definition: host.h:144
const char * kApostropheLikeUTF8[]
Definition: unicodes.cpp:48
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
TessdataManager tessdata_manager
Definition: ccutil.h:71
STRING language_data_path_prefix
Definition: ccutil.h:70
const CCUtil * getCCUtil() const
Definition: image.h:29
void reserve(int size)
int dawg_debug_level
Definition: dict.h:839
bool load_system_dawg
Definition: dict.h:802
const char * string() const
Definition: strngs.cpp:156
char * user_patterns_suffix
Definition: dict.h:801
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void LoadEquivalenceList(const char *unichar_strings[])
Definition: dict.cpp:354
STRING lang
Definition: ccutil.h:69
Definition: strngs.h:40
int length() const
Definition: genericvector.h:63
bool load_punc_dawg
Definition: dict.h:806
bool load_fixed_length_dawgs
Definition: dict.h:809
void End()
Definition: dict.cpp:335
bool load_freq_dawg
Definition: dict.h:803
GenericVector< int > SuccessorList
Definition: dawg.h:68
const char * kHyphenLikeUTF8[]
Definition: unicodes.cpp:32
const Image * getImage() const
Definition: dict.h:94
void tesseract::Dict::LoadEquivalenceList ( const char *  unichar_strings[])

Definition at line 354 of file dict.cpp.

354  {
355  equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>());
356  const UNICHARSET &unicharset = getUnicharset();
357  GenericVectorEqEq<UNICHAR_ID> *equiv_list = &equivalent_symbols_.back();
358  for (int i = 0; unichar_strings[i] != 0; i++) {
359  UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar_strings[i]);
360  if (unichar_id != INVALID_UNICHAR_ID) {
361  equiv_list->push_back(unichar_id);
362  }
363  }
364 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
int push_back(T object)
T & back() const
void tesseract::Dict::LogNewChoice ( FLOAT32  AdjustFactor,
const float  Certainties[],
bool  raw_choice,
WERD_CHOICE WordChoice,
const BLOB_CHOICE_LIST_VECTOR blob_choices 
)

Adds Choice to ChoicesList if the adjusted certainty for Choice is within a reasonable range of the best choice in ChoicesList. The ChoicesList list is kept in sorted order by rating. Duplicates are removed. WordChoice is the new choice for current word. AdjustFactor is an adjustment factor which was applied to choice. Certainties are certainties for each char in new choice. raw_choice indicates whether WordChoice is a raw or best choice.

Definition at line 484 of file stopper.cpp.

488  {
489  LIST ChoicesList;
490  LIST Choices;
491  FLOAT32 Threshold;
492 
493  if (!keep_word_choices_)
494  return;
495 
496  if (raw_choice) {
497  if (!best_raw_choice_) {
498  best_raw_choice_ =
499  NewViableChoice(*WordChoice, AdjustFactor, Certainties);
500  } else if (WordChoice->rating() < best_raw_choice_->Rating) {
501  if (ChoiceSameAs(*WordChoice, best_raw_choice_)) {
502  FillViableChoice(*WordChoice, AdjustFactor, Certainties,
503  best_raw_choice_);
504  } else {
505  delete best_raw_choice_;
506  best_raw_choice_ =
507  NewViableChoice(*WordChoice, AdjustFactor, Certainties);
508  }
509  }
510  if (!save_raw_choices) return;
511  ChoicesList = raw_choices_;
512  } else {
513  ChoicesList = best_choices_;
514  }
515 
516  // Throw out obviously bad choices to save some work.
517  if (ChoicesList != NIL_LIST) {
518  Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor);
519  if (Threshold > -stopper_ambiguity_threshold_offset)
521  if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) {
522  // Set the rating of the word to be terrible, so that it does not
523  // get chosen as the best choice.
524  if (stopper_debug_level >= 2) {
525  STRING bad_string;
526  WordChoice->string_and_lengths(&bad_string, NULL);
527  tprintf("Discarding choice \"%s\" with an overly low certainty"
528  " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
529  bad_string.string(), WordChoice->certainty(),
530  BestCertainty(ChoicesList),
531  Threshold + BestCertainty(ChoicesList));
532  }
533  WordChoice->set_rating(WERD_CHOICE::kBadRating);
534  return;
535  }
536  }
537 
538  // See if a choice with the same text string has already been found.
539  VIABLE_CHOICE NewChoice = NULL;
540  Choices = ChoicesList;
541 
542  iterate(Choices) {
543  if (ChoiceSameAs (*WordChoice, (VIABLE_CHOICE) first_node (Choices))) {
544  if (WordChoice->rating() < BestRating (Choices)) {
545  NewChoice = (VIABLE_CHOICE) first_node (Choices);
546  } else {
547  return;
548  }
549  }
550  }
551 
552  if (NewChoice) {
553  FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice);
554  ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
555  } else {
556  NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
557  }
558 
559  // Now we know we're gonna save it, so add the expensive copy.
560  NewChoice->SetBlobChoices(blob_choices);
561 
562  ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
563  if (stopper_debug_level >= 2)
564  raw_choice ? PrintViableChoice (stderr, "New Raw Choice: ", NewChoice) :
565  PrintViableChoice (stderr, "New Word Choice: ", NewChoice);
566  if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
567  Choices =
570  set_rest(Choices, NIL_LIST);
571  }
572 
573  // Update raw_choices_/best_choices_ pointer.
574  if (raw_choice) {
575  raw_choices_ = ChoicesList;
576  } else {
577  best_choices_ = ChoicesList;
578  }
579 }
#define BestFactor(Choices)
Definition: stopper.cpp:68
void set_rating(float new_val)
Definition: ratngs.h:255
double StopperAmbigThreshold(double f1, double f2)
Definition: dict.h:322
void FillViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], VIABLE_CHOICE ViableChoice)
Definition: stopper.cpp:943
void DeleteViableChoiceStruct(void *vcs)
Definition: stopper.cpp:59
int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
Definition: stopper.cpp:878
int is_same_node(void *item1, void *item2)
Definition: oldlist.cpp:241
float certainty() const
Definition: ratngs.h:234
void SetBlobChoices(const BLOB_CHOICE_LIST_VECTOR &src_choices)
Definition: stopper.cpp:146
#define NIL_LIST
Definition: oldlist.h:126
#define BestCertainty(Choices)
Definition: stopper.cpp:63
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:125
#define NULL
Definition: host.h:144
int tessedit_truncate_wordchoice_log
Definition: dict.h:865
#define set_rest(l, cell)
Definition: oldlist.h:222
#define list_rest(l)
Definition: oldlist.h:138
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice)
Dumps a text representation of the specified Choice to File.
Definition: stopper.cpp:912
VIABLE_CHOICE_STRUCT * VIABLE_CHOICE
Definition: stopper.h:86
float FLOAT32
Definition: host.h:111
void * nth_cell(LIST var_list, int item_num)
Definition: oldlist.cpp:289
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[])
Definition: stopper.cpp:902
double stopper_ambiguity_threshold_offset
Definition: dict.h:863
const char * string() const
Definition: strngs.cpp:156
#define BestRating(Choices)
Definition: stopper.cpp:66
int stopper_debug_level
Definition: dict.h:856
static const float kBadRating
Definition: ratngs.h:188
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool save_raw_choices
Definition: dict.h:864
LIST s_adjoin(LIST var_list, void *variable, int_compare compare)
Definition: oldlist.cpp:384
Definition: strngs.h:40
list_rec * LIST
Definition: baseapi.h:60
int count(LIST var_list)
Definition: oldlist.cpp:108
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:294
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
float rating() const
Definition: ratngs.h:231
void tesseract::Dict::LogNewSegmentation ( PIECES_STATE  BlobWidth)

Updates the blob widths in current_segmentation_ to be the same as provided in BlobWidth. BlobWidth[] contains the number of chunks in each blob in the current segmentation.

Definition at line 463 of file stopper.cpp.

// Copies the zero-terminated BlobWidth array into current_segmentation_,
// including the terminating 0.
// NOTE(review): no limit on the number of copied entries is visible here;
// presumably current_segmentation_ is large enough for any BlobWidth that
// callers pass -- confirm against its declaration.
463  {
464  BLOB_WIDTH *Segmentation;
     // Advance source and destination together until the 0 sentinel is read.
465  for (Segmentation = current_segmentation_; *BlobWidth != 0;
466  BlobWidth++, Segmentation++)
467  *Segmentation = *BlobWidth;
     // Write the 0 sentinel to the destination as well.
468  *Segmentation = 0;
469 }
uinT8 BLOB_WIDTH
Definition: stopper.h:31
void tesseract::Dict::LogNewSplit ( int  Blob)

Given Blob (the index of the blob that was split), adds 1 chunk to the specified blob for each choice in best_choices_ and raw_choices_, and for best_raw_choice_.

Definition at line 471 of file stopper.cpp.

// Calls AddNewChunk(choice, Blob) on best_raw_choice_ (only when it is
// non-NULL) and on every entry of the best_choices_ and raw_choices_ lists.
471  {
472  LIST Choices;
473  if (best_raw_choice_) AddNewChunk(best_raw_choice_, Blob);
474  Choices = best_choices_;
     // iterate()/first_node() are the oldlist.h list macros; each node is
     // cast to VIABLE_CHOICE before being passed on.
475  iterate(Choices) {
476  AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
477  }
     // Same treatment for the raw choices list.
478  Choices = raw_choices_;
479  iterate(Choices) {
480  AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
481  }
482 }
void AddNewChunk(VIABLE_CHOICE Choice, int Blob)
Definition: stopper.cpp:788
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
VIABLE_CHOICE tesseract::Dict::NewViableChoice ( const WERD_CHOICE WordChoice,
FLOAT32  AdjustFactor,
const float  Certainties[] 
)

Allocates a new viable choice data structure, copies WordChoice, Certainties, and current_segmentation_ into it, returns a pointer to the newly created VIABLE_CHOICE. WordChoice is a choice to be converted to a viable choice. AdjustFactor is a factor used to adjust ratings for WordChoice. Certainties contain certainty for each character in WordChoice.

Definition at line 902 of file stopper.cpp.

// Allocates a VIABLE_CHOICE_STRUCT sized by WordChoice.length(), fills it
// through FillViableChoice() and returns the pointer.
// NOTE(review): the object comes from `new`; presumably the caller (or the
// list it is stored in) is responsible for deleting it -- confirm.
904  {
905  int Length = WordChoice.length();
     // Empty words and words longer than MAX_NUM_CHUNKS trip the assert.
906  assert (Length <= MAX_NUM_CHUNKS && Length > 0);
907  VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length);
908  FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice);
909  return NewChoice;
910 }
int length() const
Definition: ratngs.h:214
void FillViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], VIABLE_CHOICE ViableChoice)
Definition: stopper.cpp:943
double tesseract::Dict::ngram_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE BestChoice,
DANGERR fixpt,
bool  fix_replaceable,
BLOB_CHOICE_LIST_VECTOR Choices,
bool *  modified_blobs 
)

Definition at line 581 of file stopper.cpp.

// Scans best_choice for ambiguous ngrams. When fix_replaceable is true a
// first pass calls ReplaceAmbig() for every matching spec; the last pass
// collects alternative unichars into ambig_blob_choices, which are then
// handed to dawg_permute_and_select(). Returns true when no alternative
// word with a negative rating was produced. Every match is also appended
// to fixpt when fixpt is non-NULL.
// NOTE(review): the body refers to best_choice and blob_choices, while the
// signature printed above this listing names them BestChoice and Choices;
// listing lines 581-584 (the definition's own parameter list) are not part
// of this extract.
585  {
586  if (stopper_debug_level > 2) {
587  tprintf("\nRunning NoDangerousAmbig() for %s\n",
588  best_choice->debug_string().string());
589  }
590 
591  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
592  // for each unichar id in BestChoice.
593  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
594  int i;
     // NOTE(review): modified_best_choice is set below but is not read
     // anywhere in the visible listing.
595  bool modified_best_choice = false;
596  bool ambigs_found = false;
597  // For each position in best_choice:
598  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
599  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
600  // -- look for ambiguities corresponding to wrong_ngram in the list while
601  // adding the following unichar_ids from best_choice to wrong_ngram
602  //
603  // Repeat the above procedure twice: first time look through
604  // ambigs to be replaced and replace all the ambiguities found;
605  // second time look through dangerous ambiguities and construct
606  // ambig_blob_choices with fake a blob choice for each ambiguity
607  // and pass them to dawg_permute_and_select() to search for
608  // ambiguous words in the dictionaries.
609  //
610  // Note that during the execution of the for loop (on the first pass)
611  // if replacements are made the length of best_choice might change.
612  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
613  bool replace = (fix_replaceable && pass == 0);
614  const UnicharAmbigsVector &table = replace ?
     // NOTE(review): listing line 615 is missing from this extract. The
     // expression started on line 614 presumably finishes with
     // getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs()
     // -- confirm against stopper.cpp.
616  if (!replace) {
617  // Initialize ambig_blob_choices with lists containing a single
618  // unichar id for the correspoding position in best_choice.
619  // best_choice consisting from only the original letters will
620  // have a rating of 0.0.
621  for (i = 0; i < best_choice->length(); ++i) {
622  BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
623  BLOB_CHOICE_IT lst_it(lst);
624  // TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights
625  // or are these fake ones good enough?
626  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
627  0.0, 0.0, -1, -1, -1, 0, 1, false));
628  ambig_blob_choices.push_back(lst);
629  }
630  }
631  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
632  int wrong_ngram_index;
633  int next_index;
     // blob_index accumulates fragment_length() of the preceding positions,
     // so it is the blob offset of position i rather than i itself.
634  int blob_index = 0;
635  for (i = 0; i < best_choice->length(); ++i) {
636  if (i > 0) blob_index += best_choice->fragment_length(i-1);
637  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
638  if (stopper_debug_level > 2) {
639  tprintf("Looking for %s ngrams starting with %s:\n",
640  replace ? "replaceable" : "ambiguous",
641  getUnicharset().debug_str(curr_unichar_id).string());
642  }
643  wrong_ngram_index = 0;
644  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
645  if (curr_unichar_id == INVALID_UNICHAR_ID ||
646  curr_unichar_id >= table.size() ||
647  table[curr_unichar_id] == NULL) {
648  continue; // there is no ambig spec for this unichar id
649  }
650  AmbigSpec_IT spec_it(table[curr_unichar_id]);
     // The iterator is advanced inside the body (only on compare == 0 or
     // compare > 0), not in the for header.
651  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
652  const AmbigSpec *ambig_spec = spec_it.data();
     // Keep wrong_ngram terminated with INVALID_UNICHAR_ID before comparing.
653  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
654  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
655  ambig_spec->wrong_ngram);
656  if (stopper_debug_level > 2) {
657  tprintf("candidate ngram: ");
     // NOTE(review): listing line 658 is missing from this extract;
     // presumably it prints wrong_ngram via UnicharIdArrayUtils::print()
     // -- confirm against stopper.cpp.
659  tprintf("current ngram from spec: ");
660  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
661  tprintf("comparison result: %d\n", compare);
662  }
663  if (compare == 0) {
664  // Record the place where we found an ambiguity.
665  if (fixpt != NULL) {
666  fixpt->push_back(DANGERR_INFO(
667  blob_index, blob_index+wrong_ngram_index, replace,
668  getUnicharset().get_isngram(ambig_spec->correct_ngram_id)));
     // NOTE(review): the debug line below prints a literal false as its
     // third value, whereas the DANGERR_INFO above was built with
     // `replace`; on the replace pass the log disagrees with what was
     // stored.
669  if (stopper_debug_level > 1) {
670  tprintf("fixpt+=(%d %d %d %d)\n", blob_index,
671  blob_index+wrong_ngram_index, false,
672  getUnicharset().get_isngram(
673  ambig_spec->correct_ngram_id));
674  }
675  }
676 
677  if (replace) {
678  if (stopper_debug_level > 2) {
679  tprintf("replace ambiguity with: ");
     // NOTE(review): listing line 680 is missing from this extract;
     // presumably it opens the UnicharIdArrayUtils::print( call whose
     // arguments follow on line 681 -- confirm.
681  ambig_spec->correct_fragments, getUnicharset());
682  }
683  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
684  ambig_spec->correct_ngram_id,
685  best_choice, blob_choices, modified_blobs);
686  modified_best_choice = true;
     // A CASE_AMBIG match at position 0 is the only kind of match that is
     // not recorded as a dangerous ambiguity.
687  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
688  // We found dang ambig - update ambig_blob_choices.
689  if (stopper_debug_level > 2) {
690  tprintf("found ambiguity: ");
     // NOTE(review): listing line 691 is missing from this extract;
     // presumably it opens the UnicharIdArrayUtils::print( call whose
     // arguments follow on line 692 -- confirm.
692  ambig_spec->correct_fragments, getUnicharset());
693  }
694  ambigs_found = true;
695  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
696  ++tmp_index) {
697  // Add a blob choice for the corresponding fragment of the
698  // ambiguity. These fake blob choices are initialized with
699  // negative ratings (which are not possible for real blob
700  // choices), so that dawg_permute_and_select() considers any
701  // word not consisting of only the original letters a better
702  // choice and stops searching for alternatives once such a
703  // choice is found.
704  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
705  bc_it.add_to_end(new BLOB_CHOICE(
706  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
707  -1, -1, -1, 0, 1, false));
708  }
709  }
710  spec_it.forward();
711  } else if (compare == -1) {
712  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
713  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
714  // Add the next unichar id to wrong_ngram and keep looking for
715  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
716  wrong_ngram[++wrong_ngram_index] =
717  best_choice->unichar_id(next_index);
718  } else {
719  break; // no more matching ambigs in this AMBIG_SPEC_LIST
720  }
721  } else {
722  spec_it.forward();
723  }
724  } // end searching AmbigSpec_LIST
725  } // end searching best_choice
726  } // end searching replace and dangerous ambigs
727 
728  // If any ambiguities were found permute the constructed ambig_blob_choices
729  // to see if an alternative dictionary word can be found.
730  if (ambigs_found) {
731  if (stopper_debug_level > 2) {
732  tprintf("\nResulting ambig_blob_choices:\n");
733  for (i = 0; i < ambig_blob_choices.length(); ++i) {
734  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
735  tprintf("\n");
736  }
737  }
     // NOTE(review): alt_word is dereferenced without a NULL check; this
     // assumes dawg_permute_and_select() always returns a word -- confirm.
738  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
     // Only the fake choices carry negative ratings (-1.0 above), so a
     // negative word rating means at least one alternative was used.
739  ambigs_found = (alt_word->rating() < 0.0);
740  if (ambigs_found) {
741  if (stopper_debug_level >= 1) {
742  tprintf ("Stopper: Possible ambiguous word = %s\n",
743  alt_word->debug_string().string());
744  }
745  if (fixpt != NULL) {
746  // Note: Currently character choices combined from fragments can only
747  // be generated by NoDangrousAmbigs(). This code should be updated if
748  // the capability to produce classifications combined from character
749  // fragments is added to other functions.
750  int orig_i = 0;
751  for (i = 0; i < alt_word->length(); ++i) {
752  bool replacement_is_ngram =
753  getUnicharset().get_isngram(alt_word->unichar_id(i));
754  int end_i = orig_i + alt_word->fragment_length(i) - 1;
755  if (alt_word->fragment_length(i) > 1 ||
756  (orig_i == end_i && replacement_is_ngram)) {
757  fixpt->push_back(DANGERR_INFO(orig_i, end_i, true,
758  replacement_is_ngram));
759  if (stopper_debug_level > 1) {
760  tprintf("fixpt->dangerous+=(%d %d %d %d)\n", orig_i, end_i,
761  true, replacement_is_ngram);
762  }
763  }
764  orig_i += alt_word->fragment_length(i);
765  }
766  }
767  }
768  delete alt_word;
769  }
770  if (output_ambig_words_file_ != NULL) {
771  fprintf(output_ambig_words_file_, "\n");
772  }
773 
     // Frees the BLOB_CHOICE_LISTs allocated with `new` on line 622.
774  ambig_blob_choices.delete_data_pointers();
775  return !ambigs_found;
776 }
int length() const
Definition: ratngs.h:214
void delete_data_pointers()
int UNICHAR_ID
Definition: unichar.h:31
const STRING debug_string() const
Definition: ratngs.h:373
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:427
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:97
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:150
int push_back(T object)
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
static int compare(const UNICHAR_ID array1[], const UNICHAR_ID array2[])
Definition: ambigs.h:62
const char * string() const
Definition: strngs.cpp:156
int stopper_debug_level
Definition: dict.h:856
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const char fragment_length(int index) const
Definition: ratngs.h:227
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, BLOB_CHOICE_LIST_VECTOR *blob_choices, bool *modified_blobs)
Definition: stopper.cpp:802
int length() const
Definition: genericvector.h:63
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:139
const UnicharAmbigs & getUnicharAmbigs()
Definition: dict.h:106
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int end_char_choice_index)
Definition: permdawg.cpp:263
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:151
float rating() const
Definition: ratngs.h:231
UNICHAR_ID tesseract::Dict::NormalizeUnicharIdForMatch ( UNICHAR_ID  unichar_id) const

Definition at line 368 of file dict.cpp.

// Returns element 0 of the first equivalent_symbols_ entry that contains
// unichar_id; when no entry contains it, unichar_id is returned unchanged.
368  {
     // Linear scan over all entries; the first match wins.
369  for (int i = 0; i < equivalent_symbols_.size(); i++) {
370  if (equivalent_symbols_[i].contains(unichar_id)) {
371  return equivalent_symbols_[i][0];
372  }
373  }
374  return unichar_id;
375 }
int size() const
Definition: genericvector.h:59
const int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 603 of file dict.h.

// Inline accessor: forwards to dawgs_.size().
603 { return dawgs_.size(); }
int size() const
Definition: genericvector.h:59
WERD_CHOICE * tesseract::Dict::permute_all ( const BLOB_CHOICE_LIST_VECTOR char_choices,
const WERD_CHOICE best_choice,
WERD_CHOICE raw_choice 
)

Definition at line 331 of file permute.cpp.

// Runs the enabled permuters over char_choices and returns the resulting
// word. Two paths are visible: one taken when the unicharset has a Han
// script id and the top word script is Han, and one for everything else.
// In the non-Han path the function returns NULL as soon as
// permute_top_choice() yields NULL, and returns that top choice directly
// when permute_only_top is set.
// NOTE(review): listing lines 350, 369 and 380 are missing from this
// extract, so the braces below do not balance as shown.
333  {
334  WERD_CHOICE *result1 = NULL;
335  WERD_CHOICE *result2 = NULL;
336  BOOL8 any_alpha;
337  float top_choice_rating_limit = best_choice->rating();
338  int word_script_id = get_top_word_script(char_choices, getUnicharset());
339 
340  PermuterState permuter_state;
341  if (getUnicharset().han_sid() != getUnicharset().null_sid() &&
342  word_script_id == getUnicharset().han_sid()) {
343  permuter_state.Init(char_choices, getUnicharset(), 1.0f, permute_debug);
344 
345  result1 = get_top_choice_word(char_choices);
346 
347  // Note that we no longer need the returned word from these permuters,
348  // except to delete the memory. The word choice from all permutations
349  // is returned by permuter_state.GetpermutedWord() at the end.
     // NOTE(review): listing line 350 is missing; presumably it is the
     // `if (permute_fixed_length_dawg) {` that the closing brace on line 353
     // belongs to -- confirm against permute.cpp.
351  result2 = permute_fixed_length_words(char_choices, &permuter_state);
352  delete result2;
353  }
354  if (permute_chartype_word) {
355  result2 = permute_chartype_words(char_choices, &permuter_state);
356  delete result2;
357  }
358  if (permute_script_word) {
359  result2 = permute_script_words(char_choices, &permuter_state);
360  delete result2;
361  }
362 
363  float certainties[MAX_PERM_LENGTH];
364  float adjust_factor;
365  result2 = permuter_state.GetPermutedWord(certainties, &adjust_factor);
366  LogNewChoice(adjust_factor, certainties, false, result2, char_choices);
     // NOTE(review): get_best_delete_other() presumably keeps the better of
     // the two words and deletes the other, as its name suggests -- confirm.
367  result1 = get_best_delete_other(result1, result2);
368 
     // NOTE(review): listing line 369 is missing; presumably it applies
     // incorporate_segcost(result1) when segment_segcost_rating is set
     // -- confirm against permute.cpp.
370  } else {
371  result1 = permute_top_choice(char_choices, &top_choice_rating_limit,
372  raw_choice, &any_alpha);
373  if (result1 == NULL)
374  return (NULL);
375  if (permute_only_top)
376  return result1;
377 
378  if (permute_chartype_word) {
379  permuter_state.Init(char_choices, getUnicharset(),
     // NOTE(review): listing line 380 is missing; presumably it carries the
     // remaining Init() arguments (segment_penalty_garbage, permute_debug)
     // -- confirm against permute.cpp.
381  result2 = permute_chartype_words(char_choices, &permuter_state);
382  result1 = get_best_delete_other(result1, result2);
383  }
384 
385  // Permute character fragments if necessary.
386  if (result1 == NULL || result1->fragment_mark()) {
387  result2 = top_fragments_permute_and_select(char_choices,
388  top_choice_rating_limit);
389  result1 = get_best_delete_other(result1, result2);
390  }
391 
     // The dawg and compound permuters are limited by best_choice->rating(),
     // not by top_choice_rating_limit, which permute_top_choice() received
     // by pointer.
392  result2 = dawg_permute_and_select(char_choices, best_choice->rating());
393  result1 = get_best_delete_other(result1, result2);
394 
395  result2 = permute_compound_words(char_choices, best_choice->rating());
396  result1 = get_best_delete_other(result1, result2);
397  }
398  return result1;
399 }
int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &unicharset)
Definition: dict.cpp:908
WERD_CHOICE * top_fragments_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permute.cpp:1367
unsigned char BOOL8
Definition: host.h:113
bool segment_segcost_rating
Definition: dict.h:878
#define NULL
Definition: host.h:144
WERD_CHOICE * permute_fixed_length_words(const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
Definition: permute.cpp:430
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool permute_debug
Definition: dict.h:872
#define f(xc, yc)
Definition: imgscale.cpp:39
void incorporate_segcost(WERD_CHOICE *word)
Incorporate segmentation cost into word rating.
Definition: permute.cpp:409
double segment_penalty_garbage
Definition: dict.h:835
WERD_CHOICE * permute_compound_words(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permute.cpp:799
WERD_CHOICE * permute_top_choice(const BLOB_CHOICE_LIST_VECTOR &char_choices, float *rating_limit, WERD_CHOICE *raw_choice, BOOL8 *any_alpha)
Definition: permute.cpp:934
WERD_CHOICE * permute_chartype_words(const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
Checks for consistency in character property (e.g. alpha, digit, punct).
Definition: permute.cpp:542
WERD_CHOICE * permute_script_words(const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state)
Definition: permute.cpp:669
bool fragment_mark() const
Definition: ratngs.h:241
bool permute_fixed_length_dawg
Definition: dict.h:888
#define MAX_PERM_LENGTH
Definition: permute.h:36
WERD_CHOICE * get_top_choice_word(const BLOB_CHOICE_LIST_VECTOR &char_choices)
Definition: permute.cpp:908
bool permute_script_word
Definition: dict.h:876
bool permute_chartype_word
Definition: dict.h:890
WERD_CHOICE * get_best_delete_other(WERD_CHOICE *choice1, WERD_CHOICE *choice2)
Definition: permute.cpp:74
bool permute_only_top
Definition: dict.h:910
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int end_char_choice_index)
Definition: permdawg.cpp:263
float rating() const
Definition: ratngs.h:231
void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)
Definition: stopper.cpp:484
bool tesseract::Dict::permute_characters ( const BLOB_CHOICE_LIST_VECTOR char_choices,
WERD_CHOICE best_choice,
WERD_CHOICE raw_choice 
)

permute_characters

Permute these characters together according to each of the different permuters that are enabled. Returns true if best_choice was updated.

Definition at line 765 of file permute.cpp.

// Calls permute_all() on char_choices and, when the returned word has a
// strictly lower rating than *best_choice, copies it into *best_choice and
// returns true. The word returned by permute_all() is deleted here in
// either case.
767  {
768  if (permute_debug) {
769  tprintf("\n\n\n##### Permute_Characters #######\n");
770  print_char_choices_list("\n==> Input CharChoices", char_choices,
     // NOTE(review): listing line 771 is missing from this extract;
     // presumably it holds the remaining arguments (getUnicharset() and a
     // segment_debug-based flag) -- confirm against permute.cpp.
772  tprintf("\n");
773  }
774 
     // A single blob whose top choice has unichar id 0 is not permuted.
     // NOTE(review): presumably id 0 is a null/space unichar -- confirm.
775  if (char_choices.length() == 1 &&
776  get_top_choice_uid(char_choices.get(0)) == 0) return false;
777  WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice);
778 
779  if (this_choice && this_choice->rating() < best_choice->rating()) {
780  *best_choice = *this_choice;
781 
782  if (permute_debug) {
783  best_choice->print("\n**** Populate BestChoice");
784  cprintf("populate best_choice\n\t%s\n",
785  best_choice->debug_string().string());
786  }
787  delete this_choice;
788  return true;
789  }
     // this_choice may be NULL here; deleting a null pointer is a no-op.
790  delete this_choice;
791  return false;
792 }
UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list)
Definition: permute.cpp:99
const STRING debug_string() const
Definition: ratngs.h:373
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
bool permute_debug
Definition: dict.h:872
WERD_CHOICE * permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, const WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
Definition: permute.cpp:331
int segment_debug
Definition: dict.h:871
const char * string() const
Definition: strngs.cpp:156
const void print() const
Definition: ratngs.h:406
void print_char_choices_list(const char *msg, const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &current_unicharset, BOOL8 detailed)
Definition: ratngs.cpp:610
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int length() const
Definition: genericvector.h:63
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * tesseract::Dict::permute_chartype_words ( const BLOB_CHOICE_LIST_VECTOR char_choices,
PermuterState permuter_state 
)

Checks for consistency in character property (e.g. alpha, digit, punct).

Definition at line 542 of file permute.cpp.

// Builds a word from one choice per position, replacing an ambiguous top
// choice whose character type is inconsistent with the word type (or with
// the previous unambiguous character) by a better-fitting alternative.
// Returns NULL when char_choices has MAX_PERM_LENGTH or more entries, when
// top_word_chartype() reports 0 or 'p', or when any position's top choice
// has unichar id 0. Otherwise returns a WERD_CHOICE allocated with `new`.
// NOTE(review): ownership of the returned word presumably passes to the
// caller (permute_all() deletes it or hands it to get_best_delete_other()).
544  {
545 
546  if (char_choices.length() >= MAX_PERM_LENGTH)
547  return NULL;
548  // Store main character property of top choice at every position
549  char pos_chartypes[MAX_PERM_LENGTH];
550  char word_type = top_word_chartype(char_choices, pos_chartypes);
551  if (word_type == 0 || word_type == 'p')
552  return NULL; // skip if word type is punctuation or unknown
553  if (permute_debug) {
554  tprintf("\n\nPermuteCharType[%c]\n", word_type);
555  print_char_choices_list("", char_choices, getUnicharset(), true);
556  }
557 
558  WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
559  BLOB_CHOICE_IT blob_choice_it;
560  const UNICHARSET& unicharset = getUnicharset();
561  bool replaced = false; // has any character choice been replaced
562  int prev_unambig_type = 0; // the last chartype of an unambiguous char
563  float certainties[MAX_PERM_LENGTH + 1];
564  for (int x = 0; x < char_choices.length(); ++x) {
565  BLOB_CHOICE_LIST* pos_choice = char_choices.get(x);
566  UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice);
     // Give up on the whole word, freeing the partial result, when a
     // position's top choice has id 0.
567  if (unichar_id == 0) {
568  delete current_word;
569  return NULL;
570  }
571  blob_choice_it.set_to_list(pos_choice);
572  BLOB_CHOICE *first_choice = blob_choice_it.data();
573  ASSERT_HOST(first_choice != NULL);
574 
575  const UnicharIdVector* ambig_uids =
     // NOTE(review): listing line 576 is missing from this extract;
     // presumably the initializer is
     // getUnicharAmbigs().OneToOneDefiniteAmbigs(unichar_id) -- confirm
     // against permute.cpp.
577  bool is_ambiguous = (ambig_uids != NULL);
578  bool is_punct = unicharset.get_ispunctuation(unichar_id);
     // Consistent means: punctuation, or same type as the previous
     // unambiguous character, or same type as the word as a whole.
579  bool is_consistent = is_punct ||
580  unicharset.get_chartype(unichar_id) == prev_unambig_type ||
581  unicharset.get_chartype(unichar_id) == word_type;
582  bool is_fragment = getUnicharset().get_fragment(unichar_id) != NULL;
583  if (permute_debug)
584  tprintf("char[%d]:%s is_ambig %c is_punct %c is_consistent %c\n",
585  x, unicharset.id_to_unichar(unichar_id),
586  is_ambiguous?'T':'F', is_punct?'T':'F', is_consistent?'T':'F');
587 
588  if (is_fragment) {
589  // Ignore any fragmented characters by skipping them to next choice
590  // (originally first choice).
591  first_choice = get_nth_choice(pos_choice, 1);
592  ASSERT_HOST(first_choice != NULL);
593  } else if (is_ambiguous && !is_consistent) {
594  // Check every confusable blob choice where the top choice is inconsistent
595  // with the character type of the previous unambiguous character.
596  if (permute_debug) {
     // prev_unambig_type is an int printed with %c; it is 0 until the first
     // unambiguous character has been seen.
597  tprintf("Checking %s r%g PrevCharType %c\n",
598  unicharset.id_to_unichar(unichar_id),
599  first_choice->rating(), prev_unambig_type);
600  print_ratings_list("\t", pos_choice, getUnicharset());
601  }
     // c_it was just set to NULL, so the test on line 603 is always true and
     // the word-type lookup always runs first.
602  BLOB_CHOICE* c_it = NULL;
603  if (c_it == NULL) {
604  c_it = find_choice_by_type(pos_choice, word_type, unicharset);
605  }
606 
607  // Prefer a character choice whose type is the same as the previous
608  // unambiguous character and the confusion appears in the ambig list.
609  if (c_it == NULL && prev_unambig_type > 0) {
610  c_it = find_choice_by_type(pos_choice, prev_unambig_type, unicharset);
611  if (c_it &&
612  UnicharIdArrayUtils::find_in(*ambig_uids, c_it->unichar_id()) < 0)
613  c_it = NULL;
614  }
615 
616  // Otherwise, perfer a punctuation
617  if (c_it == NULL) {
618  c_it = find_choice_by_type(pos_choice, 'p', unicharset);
619  if (c_it &&
620  UnicharIdArrayUtils::find_in(*ambig_uids, c_it->unichar_id()) < 0)
621  c_it = NULL;
622  }
623 
624  // save any preference other than the top choice
625  if (c_it != NULL) {
626  if (permute_debug) {
627  tprintf("Replacing %s r%g ==> %s r%g\n",
628  unicharset.id_to_unichar(unichar_id), first_choice->rating(),
629  unicharset.id_to_unichar(c_it->unichar_id()), c_it->rating());
630  tprintf("\n\nPermuteCharType[%c]\n", word_type);
631  print_char_choices_list("", char_choices, getUnicharset(), false);
632  }
     // permuter_state is optional; the preference is recorded only when the
     // caller supplied one.
633  if (permuter_state)
634  permuter_state->AddPreference(x, c_it, segment_reward_chartype);
635  first_choice = c_it;
636  replaced = true;
637  }
638  } else if (!is_ambiguous && !is_punct) {
639  // keep the last unambiguous character type
640  prev_unambig_type = pos_chartypes[x];
641  }
642  current_word->append_unichar_id(first_choice->unichar_id(), 1,
643  first_choice->rating(),
644  first_choice->certainty());
645  certainties[x] = first_choice->certainty();
646  }
647  // All permuter choices should go through adjust_non_word so the choice
648  // rating would be adjusted on the same scale.
649  adjust_non_word(current_word, certainties, &char_choices, permute_debug);
650  if (replaced) {
651  // Apply a reward multiplier on rating if an chartype permutation is made.
652  float rating = current_word->rating();
653  current_word->set_rating(rating * segment_reward_chartype);
654  if (permute_debug)
655  current_word->print("<== permute_chartype_word **");
656  }
657  return current_word;
658 }
UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list)
Definition: permute.cpp:99
void set_rating(float new_val)
Definition: ratngs.h:255
BLOB_CHOICE * find_choice_by_type(BLOB_CHOICE_LIST *blob_choices, char target_type, const UNICHARSET &unicharset)
Definition: permute.cpp:181
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices, char *pos_chartypes)
Definition: permute.cpp:512
double segment_reward_chartype
Definition: dict.h:892
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.cpp:313
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:34
BLOB_CHOICE * get_nth_choice(BLOB_CHOICE_LIST *blob_list, int n)
Definition: permute.cpp:91
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
bool permute_debug
Definition: dict.h:872
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
float certainty() const
Definition: ratngs.h:65
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
const void print() const
Definition: ratngs.h:406
void print_char_choices_list(const char *msg, const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &current_unicharset, BOOL8 detailed)
Definition: ratngs.cpp:610
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:502
void adjust_non_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool debug)
Definition: dict.h:719
#define MAX_PERM_LENGTH
Definition: permute.h:36
int length() const
Definition: genericvector.h:63
static int find_in(const UnicharIdVector &uid_vec, const UNICHAR_ID uid)
Definition: ambigs.h:77
#define ASSERT_HOST(x)
Definition: errcode.h:84
const UnicharAmbigs & getUnicharAmbigs()
Definition: dict.h:106
const UnicharIdVector * OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const
Definition: ambigs.h:167
float rating() const
Definition: ratngs.h:231
float rating() const
Definition: ratngs.h:62
void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 1437 of file permute.cpp.

// Calls append_choices() once for every BLOB_CHOICE in the list at
// char_choices[char_choice_index], decrementing *attempts_left before each
// call and stopping early once it reaches 0 or below. Does nothing (apart
// from the optional debug line) when char_choice_index is past the end of
// char_choices.
1447  {
      // `debug` doubles as the on/off switch and as the prefix string of the
      // debug output.
1448  if (debug) {
1449  tprintf("%s permute_choices: char_choice_index=%d"
1450  " limit=%g rating=%g, certainty=%g word=%s\n",
1451  debug, char_choice_index, *limit, word->rating(),
1452  word->certainty(), word->debug_string().string());
1453  }
1454  if (char_choice_index < char_choices.length()) {
1455  BLOB_CHOICE_IT blob_choice_it;
1456  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
1457  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1458  blob_choice_it.forward()) {
1459  (*attempts_left)--;
1460  append_choices(debug, char_choices, *(blob_choice_it.data()),
1461  char_choice_index, prev_char_frag_info, word,
1462  certainties, limit, best_choice, attempts_left, more_args);
      // attempts_left is passed down by pointer, so the callee can consume
      // the budget as well; re-check it after every call.
1463  if (*attempts_left <= 0) {
1464  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
1465  break;
1466  }
1467  }
1468  }
1469 }
const STRING debug_string() const
Definition: ratngs.h:373
float certainty() const
Definition: ratngs.h:234
T & get(int index) const
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int length() const
Definition: genericvector.h:63
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permute.cpp:1479
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * tesseract::Dict::permute_compound_words ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)

permute_compound_words

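Split the word at each position whose top choice is a hyphen or slash, permute each piece with permute_subword(), and return the recombined word tagged COMPOUND_PERM. Returns NULL if no such separator splits the word or the rating limit is exceeded; returns a "bad" choice if the word is longer than MAX_WERD_LENGTH.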

Definition at line 799 of file permute.cpp.

801  {
802  BLOB_CHOICE *first_choice;
803  WERD_CHOICE *best_choice = NULL;
804  WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH);
805  int first_index = 0;
806  int x;
807  BLOB_CHOICE_IT blob_choice_it;
808 
809  if (char_choices.length() > MAX_WERD_LENGTH) {
810  WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset());
811  bad_word_choice->make_bad();
812  return bad_word_choice;
813  }
814 
815  UNICHAR_ID slash = getUnicharset().unichar_to_id("/");
816  UNICHAR_ID dash = getUnicharset().unichar_to_id("-");
817  for (x = 0; x < char_choices.length(); ++x) {
818  blob_choice_it.set_to_list(char_choices.get(x));
819  first_choice = blob_choice_it.data();
820  if (first_choice->unichar_id() == slash ||
821  first_choice->unichar_id() == dash) {
822  if (x > first_index) {
823  if (segment_debug)
824  cprintf ("Hyphenated word found\n");
825  permute_subword(char_choices, rating_limit, first_index,
826  x - 1, &current_word);
827  if (current_word.rating() > rating_limit)
828  break;
829  }
830  // Append hyphen/slash separator to current_word.
831  current_word.append_unichar_id_space_allocated(
832  first_choice->unichar_id(), 1,
833  first_choice->rating(), first_choice->certainty());
834 
835  first_index = x + 1; // update first_index
836  }
837  }
838 
839  if (first_index > 0 && first_index < x &&
840  current_word.rating() <= rating_limit) {
841  permute_subword(char_choices, rating_limit, first_index,
842  x - 1, &current_word);
843  best_choice = new WERD_CHOICE(current_word);
844  best_choice->set_permuter(COMPOUND_PERM);
845  }
846  return (best_choice);
847 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:321
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
int segment_debug
Definition: dict.h:871
float certainty() const
Definition: ratngs.h:65
int length() const
Definition: genericvector.h:63
void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int start, int end, WERD_CHOICE *current_word)
Definition: permute.cpp:859
#define MAX_WERD_LENGTH
Definition: dict.h:33
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
void set_permuter(uinT8 perm)
Definition: ratngs.h:261
float rating() const
Definition: ratngs.h:62
WERD_CHOICE * tesseract::Dict::permute_fixed_length_words ( const BLOB_CHOICE_LIST_VECTOR char_choices,
PermuterState permuter_state 
)

Find permutations matching a list of fixed-char-length dawgs. The best choice based on this permuter alone is returned. Alternatively, non-conflicting changes can be combined through permuter_state.

Perform search on fixed-length dictionaries within a word. This is used for non-space-delimited languages like CJK, where a "word" corresponds to a "phrase" consisting of multiple short words. It iterates over every character position looking for the longest matches against a set of fixed-length dawgs. Each dictionary hit is rewarded with a rating bonus. Note: this is very slow as it is performed on every segmentation state.

Definition at line 430 of file permute.cpp.

432  {
433  if (permute_debug)
434  print_char_choices_list("\n\nPermute FixedLength Word",
435  char_choices, getUnicharset(), false);
436  WERD_CHOICE* best_choice =
437  new WERD_CHOICE(&getUnicharset(), char_choices.length());
438  const int max_dict_len = max_fixed_length_dawgs_wdlen_;
439  const int min_dict_len = 2;
440  char posstr[256];
441  int match_score = 0;
442  int anchor_pos = 0;
443  while (anchor_pos < char_choices.length()) {
444  // search from longest phrase to shortest, stop when we find a match
445  WERD_CHOICE* part_choice = NULL;
446  int step = max_dict_len;
447  while (step >= min_dict_len) {
448  int end_pos = anchor_pos + step - 1;
449  if (end_pos < char_choices.length()) {
450  part_choice = dawg_permute_and_select(char_choices,
451  200.0, // rate limit
452  step,
453  anchor_pos);
454  if (part_choice->length() == step) {
455  if (permute_debug)
456  tprintf("match found at pos=%d len=%d\n%s\n", anchor_pos, step,
457  part_choice->unichar_string().string());
458  break;
459  }
460  delete part_choice;
461  part_choice = NULL;
462  }
463  step--;
464  }
465 
466  if (part_choice && step > 1) { // found lexicon match
467  get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
468  float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
469  if (permuter_state)
470  permuter_state->AddPreference(anchor_pos, posstr, adjust_factor);
471  match_score += step - 1; // single chars don't count
472  if (permute_debug)
473  tprintf("Promote word rating %d-len%d\n%s\n", anchor_pos, step,
474  part_choice->unichar_string().string());
475  } else { // no lexicon match
476  step = 1;
477  part_choice = get_choice_from_posstr(&getUnicharset(), char_choices,
478  anchor_pos, "0", NULL);
479  if (permute_debug)
480  tprintf("Single char %d %s\n", anchor_pos,
481  part_choice->unichar_string().string());
482  }
483  if (part_choice && part_choice->length() > 0)
484  (*best_choice) += (*part_choice);
485  if (part_choice) delete part_choice;
486  anchor_pos += step;
487  }
488 
489  if (match_score > 0) {
490  float adjust_factor = pow(0.8, // 1.0/segment_penalty_dict_nonword,
491  match_score * 2.0 / char_choices.length());
492  float adjusted_score = best_choice->rating() * adjust_factor;
493  if (permute_debug)
494  tprintf("Adjusting score %f @ %d -> %f\n",
495  best_choice->rating(), match_score, adjusted_score);
496  best_choice->set_rating(adjusted_score);
497  }
498  if (permute_debug)
499  tprintf("Found Best CJK word %f: %s\n",
500  best_choice->rating(), best_choice->unichar_string().string());
501  return best_choice;
502 }
int length() const
Definition: ratngs.h:214
const STRING & unichar_string() const
Definition: ratngs.h:395
void set_rating(float new_val)
Definition: ratngs.h:255
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool permute_debug
Definition: dict.h:872
void get_posstr_from_choice(const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *word_choice, int start_pos, char *pos_str)
Definition: permute.cpp:161
const char * string() const
Definition: strngs.cpp:156
void print_char_choices_list(const char *msg, const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &current_unicharset, BOOL8 detailed)
Definition: ratngs.cpp:610
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int length() const
Definition: genericvector.h:63
WERD_CHOICE * get_choice_from_posstr(const UNICHARSET *unicharset, const BLOB_CHOICE_LIST_VECTOR &char_choices, int start_pos, const char *pos_str, float *certainties)
Definition: permute.cpp:129
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int end_char_choice_index)
Definition: permdawg.cpp:263
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * tesseract::Dict::permute_script_words ( const BLOB_CHOICE_LIST_VECTOR char_choices,
PermuterState permuter_state 
)

Checks for script-consistent permutations. Similar to fixed-length permuter, the best choice is returned by the function, but the combined changes are also recorded into permuter_state.

Try flipping characters in a word to get better script consistency. Similar to how upper/lower case checking is done in top_choice_permuter, this permuter tries to suggest a more script-consistent choice AND modifies the rating. So it combines both the case_ok check and adjust_non_word functionality. However, instead of penalizing an inconsistent word with a > 1 multiplier, we reward the script-consistent choice with a < 1 multiplier.

Definition at line 669 of file permute.cpp.

671  {
672  if (char_choices.length() >= MAX_WERD_LENGTH)
673  return NULL;
674 
675  int word_sid = get_top_word_script(char_choices, getUnicharset());
676  if (word_sid == getUnicharset().null_sid())
677  return NULL;
678 
679  if (permute_debug) {
680  tprintf("\n\nPermuteScript %s\n",
681  getUnicharset().get_script_from_script_id(word_sid));
682  print_char_choices_list("", char_choices, getUnicharset(),
683  permute_debug > 1);
684  }
685 
686  WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
687  BLOB_CHOICE_IT blob_choice_it;
688  bool replaced = false;
689  bool prev_is_consistent = false;
690  float certainties[MAX_PERM_LENGTH + 1];
691  for (int x = 0; x < char_choices.length(); ++x) {
692  blob_choice_it.set_to_list(char_choices.get(x));
693  BLOB_CHOICE *first_choice = blob_choice_it.data();
694  if (!first_choice) {
695  delete current_word;
696  return NULL;
697  }
698  UNICHAR_ID unichar_id = first_choice->unichar_id();
699  if (unichar_id == 0) {
700  delete current_word;
701  return NULL;
702  }
703  bool sid_consistent = (getUnicharset().get_script(unichar_id) == word_sid);
704  bool this_is_punct = getUnicharset().get_chartype(unichar_id) == 'p';
705  bool is_fragment = getUnicharset().get_fragment(unichar_id) != NULL;
706 
707  if (is_fragment) {
708  // Ignore any fragmented characters by skipping them to next choice
709  // (originally first choice).
710  first_choice = get_nth_choice(char_choices.get(x), 1);
711  ASSERT_HOST(first_choice != NULL);
712  } else if (!sid_consistent && !this_is_punct && prev_is_consistent) {
713  // If the previous char is CJK, we prefer a cjk over non-cjk char
714  if (permute_debug) {
715  tprintf("Checking %s r%g\n", getUnicharset().id_to_unichar(unichar_id),
716  first_choice->rating());
717  print_ratings_list("\t", char_choices.get(x), getUnicharset());
718  }
719  // prefer a script consistent choice
720  BLOB_CHOICE* c_it = find_choice_by_script(char_choices.get(x),
721  word_sid, 0, 0);
722  // otherwise, prefer a punctuation
723  if (c_it == NULL)
724  c_it = find_choice_by_type(char_choices.get(x), 'p', getUnicharset());
725 
726  if (c_it != NULL) {
727  if (permute_debug)
728  tprintf("Replacing %s r%g ==> %s r%g\n",
729  getUnicharset().id_to_unichar(unichar_id),
730  first_choice->rating(),
731  getUnicharset().id_to_unichar(c_it->unichar_id()),
732  c_it->rating());
733  if (permuter_state)
734  permuter_state->AddPreference(x, c_it, segment_reward_script);
735  first_choice = c_it;
736  replaced = true;
737  }
738  }
739  current_word->append_unichar_id(first_choice->unichar_id(), 1,
740  first_choice->rating(),
741  first_choice->certainty());
742  certainties[x] = first_choice->certainty();
743  prev_is_consistent = sid_consistent;
744  }
745  // All permuter choices should go through adjust_non_word so the choice
746  // rating would be adjusted on the same scale.
747  adjust_non_word(current_word, certainties, &char_choices, permute_debug);
748  if (replaced) {
749  // Apply a reward multiplier on rating if a script permutation is made.
750  float rating = current_word->rating();
751  current_word->set_rating(rating * segment_reward_script);
752  if (permute_debug)
753  current_word->print("<== permute_script_word **");
754  }
755  return current_word;
756 }
void set_rating(float new_val)
Definition: ratngs.h:255
BLOB_CHOICE * find_choice_by_type(BLOB_CHOICE_LIST *blob_choices, char target_type, const UNICHARSET &unicharset)
Definition: permute.cpp:181
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &unicharset)
Definition: dict.cpp:908
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.cpp:313
BLOB_CHOICE * get_nth_choice(BLOB_CHOICE_LIST *blob_list, int n)
Definition: permute.cpp:91
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
bool permute_debug
Definition: dict.h:872
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
BLOB_CHOICE * find_choice_by_script(BLOB_CHOICE_LIST *blob_choices, int target_sid, int backup_sid, int secondary_sid)
Definition: permute.cpp:206
float certainty() const
Definition: ratngs.h:65
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:552
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
const void print() const
Definition: ratngs.h:406
void print_char_choices_list(const char *msg, const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &current_unicharset, BOOL8 detailed)
Definition: ratngs.cpp:610
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:502
void adjust_non_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool debug)
Definition: dict.h:719
#define MAX_PERM_LENGTH
Definition: permute.h:36
int length() const
Definition: genericvector.h:63
#define MAX_WERD_LENGTH
Definition: dict.h:33
double segment_reward_script
Definition: dict.h:886
#define ASSERT_HOST(x)
Definition: errcode.h:84
float rating() const
Definition: ratngs.h:231
float rating() const
Definition: ratngs.h:62
void tesseract::Dict::permute_subword ( const BLOB_CHOICE_LIST_VECTOR & char_choices,
float  rating_limit,
int  start,
int  end,
WERD_CHOICE * current_word 
)

permute_subword

Permute a part of a compound word. This subword is bounded by hyphens and the start and end of the word. Call the standard word permute function on a set of choices covering only part of the original word. When it is done, reclaim the memory that was used in the exercise.

Definition at line 859 of file permute.cpp.

863  {
864  int x;
865  BLOB_CHOICE_LIST_VECTOR subchoices;
866  WERD_CHOICE *best_choice = NULL;
867  WERD_CHOICE raw_choice(&getUnicharset());
868  raw_choice.make_bad();
869 
870  DisableChoiceAccum();
871 
872  for (x = start; x <= end; x++) {
873  if (char_choices.get(x) != NULL) {
874  subchoices += char_choices.get(x);
875  }
876  }
877 
878  if (!subchoices.empty()) {
879  WERD_CHOICE initial_choice(&getUnicharset());
880  initial_choice.make_bad();
881  initial_choice.set_rating(rating_limit);
882 
883  best_choice = permute_all(subchoices, &initial_choice, &raw_choice);
884 
885  if (best_choice && best_choice->length() > 0) {
886  *current_word += *best_choice;
887  } else {
888  current_word->set_rating(MAX_FLOAT32);
889  }
890  } else {
891  current_word->set_rating(MAX_FLOAT32);
892  }
893 
894  if (best_choice)
895  delete best_choice;
896 
897  if (segment_debug && current_word->rating() < MAX_FLOAT32) {
898  cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
899  current_word->debug_string().string(),
900  current_word->rating(), current_word->certainty());
901  }
902  EnableChoiceAccum();
903 }
int length() const
Definition: ratngs.h:214
void set_rating(float new_val)
Definition: ratngs.h:255
const STRING debug_string() const
Definition: ratngs.h:373
float certainty() const
Definition: ratngs.h:234
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
void EnableChoiceAccum()
Definition: dict.h:345
WERD_CHOICE * permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, const WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
Definition: permute.cpp:331
int segment_debug
Definition: dict.h:871
const char * string() const
Definition: strngs.cpp:156
bool empty() const
Definition: genericvector.h:68
#define MAX_FLOAT32
Definition: host.h:124
void DisableChoiceAccum()
Definition: dict.h:344
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * tesseract::Dict::permute_top_choice ( const BLOB_CHOICE_LIST_VECTOR & char_choices,
float *  rating_limit,
WERD_CHOICE * raw_choice,
BOOL8 * any_alpha 
)

permute_top_choice

Return the top choice for each character as the choice for the word. In addition a choice is created for the best lower and upper case non-words. In each character position the best lower (or upper) case character is substituted for the best overall character.

Definition at line 934 of file permute.cpp.

938  {
939  BLOB_CHOICE *first_choice;
940  const char *first_char; //first choice
941  const char *second_char; //second choice
942  const char *third_char; //third choice
943  char prev_char[UNICHAR_LEN + 1]; //prev in word
944  const char *next_char = ""; //next in word
945  const char *next_next_char = ""; //after next next in word
946 
947  WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
948  word.set_permuter(TOP_CHOICE_PERM);
949  WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH);
950  capital_word.set_permuter(UPPER_CASE_PERM);
951  WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH);
952  lower_word.set_permuter(LOWER_CASE_PERM);
953 
954  int x;
955  BOOL8 char_alpha;
956  float first_rating = 0;
957 
958  float certainties[MAX_PERM_LENGTH + 1];
959  float lower_certainties[MAX_PERM_LENGTH + 1];
960  float upper_certainties[MAX_PERM_LENGTH + 1];
961 
962  BLOB_CHOICE_IT blob_choice_it;
963  UNICHAR_ID temp_id;
964  UNICHAR_ID unichar_id;
965  UNICHAR_ID space = getUnicharset().unichar_to_id(" ");
966  register const char* ch;
967  register inT8 lower_done;
968  register inT8 upper_done;
969 
970  prev_char[0] = '\0';
971 
972  if (any_alpha != NULL)
973  *any_alpha = FALSE;
974 
975  if (char_choices.length() > MAX_PERM_LENGTH) {
976  return (NULL);
977  }
978 
979  for (x = 0; x < char_choices.length(); ++x) {
980  if (x + 1 < char_choices.length()) {
981  unichar_id = get_top_choice_uid(char_choices.get(x+1));
982  next_char = unichar_id != INVALID_UNICHAR_ID ?
983  getUnicharset().id_to_unichar(unichar_id) : "";
984  } else {
985  next_char = "";
986  }
987 
988  if (x + 2 < char_choices.length()) {
989  unichar_id = get_top_choice_uid(char_choices.get(x+2));
990  next_next_char = unichar_id != INVALID_UNICHAR_ID ?
991  getUnicharset().id_to_unichar(unichar_id) : "";
992  } else {
993  next_next_char = "";
994  }
995 
996  blob_choice_it.set_to_list(char_choices.get(x));
997  ASSERT_HOST(!blob_choice_it.empty());
998  first_choice = NULL;
999  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1000  blob_choice_it.forward()) { // find the best non-fragment char choice
1001  temp_id = blob_choice_it.data()->unichar_id();
1002  if (!(getUnicharset().get_fragment(temp_id))) {
1003  first_choice = blob_choice_it.data();
1004  break;
1005  } else if (char_choices.length() > 1) {
1006  word.set_fragment_mark(true);
1007  capital_word.set_fragment_mark(true);
1008  lower_word.set_fragment_mark(true);
1009  }
1010  }
1011  if (first_choice == NULL) {
1012  cprintf("Permuter found only fragments for"
1013  " character at position %d; word=%s\n",
1014  x, word.debug_string().string());
1015  }
1016  ASSERT_HOST(first_choice != NULL);
1017 
1018  unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
1019  first_choice->unichar_id() : space;
1020  first_char = getUnicharset().id_to_unichar(unichar_id);
1021  first_rating = first_choice->rating();
1022  word.append_unichar_id_space_allocated(
1023  unichar_id, 1, first_choice->rating(), first_choice->certainty());
1024  capital_word.append_unichar_id_space_allocated(
1025  unichar_id, 1, first_choice->rating(), first_choice->certainty());
1026  lower_word.append_unichar_id_space_allocated(
1027  unichar_id, 1, first_choice->rating(), first_choice->certainty());
1028 
1029  certainties[x] = first_choice->certainty();
1030  lower_certainties[x] = first_choice->certainty();
1031  upper_certainties[x] = first_choice->certainty();
1032 
1033  lower_done = FALSE;
1034  upper_done = FALSE;
1035  char_alpha = FALSE;
1036  second_char = "";
1037  third_char = "";
1038  for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
1039  unichar_id = blob_choice_it.data()->unichar_id();
1040  if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() &&
1041  blob_choice_it.data_relative(1)->rating() == first_rating) {
1042  temp_id = blob_choice_it.data_relative(1)->unichar_id();
1043  if (getUnicharset().eq(temp_id, "1") ||
1044  getUnicharset().eq(temp_id, "I")) {
1045  second_char = getUnicharset().id_to_unichar(temp_id);
1046  blob_choice_it.forward();
1047  if (!blob_choice_it.at_last() &&
1048  blob_choice_it.data_relative(1)->rating() == first_rating) {
1049  temp_id = blob_choice_it.data_relative(1)->unichar_id();
1050  if (getUnicharset().eq(temp_id, "1") ||
1051  getUnicharset().eq(temp_id, "I")) {
1052  third_char = getUnicharset().id_to_unichar(temp_id);
1053  blob_choice_it.forward();
1054  }
1055  }
1056  ch = choose_il1 (first_char, second_char, third_char,
1057  prev_char, next_char, next_next_char);
1058  unichar_id = (ch != NULL && *ch != '\0') ?
1059  getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID;
1060  if (strcmp(ch, "l") != 0 &&
1061  getUnicharset().eq(word.unichar_id(x), "l")) {
1062  word.set_unichar_id(unichar_id, x);
1063  lower_word.set_unichar_id(unichar_id, x);
1064  capital_word.set_unichar_id(unichar_id, x);
1065  }
1066  }
1067  }
1068  if (unichar_id != INVALID_UNICHAR_ID) {
1069  /* Find lower case */
1070  if (!lower_done &&
1071  (getUnicharset().get_islower(unichar_id) ||
1072  (getUnicharset().get_isupper(unichar_id) && x == 0))) {
1073  lower_word.set_unichar_id(unichar_id, x);
1074  lower_word.set_rating(lower_word.rating() -
1075  first_choice->rating() + blob_choice_it.data()->rating());
1076  if (blob_choice_it.data()->certainty() < lower_word.certainty()) {
1077  lower_word.set_certainty(blob_choice_it.data()->certainty());
1078  }
1079  lower_certainties[x] = blob_choice_it.data()->certainty();
1080  lower_done = TRUE;
1081  }
1082  /* Find upper case */
1083  if (!upper_done && getUnicharset().get_isupper(unichar_id)) {
1084  capital_word.set_unichar_id(unichar_id, x);
1085  capital_word.set_rating(capital_word.rating() -
1086  first_choice->rating() + blob_choice_it.data()->rating());
1087  if (blob_choice_it.data()->certainty() < capital_word.certainty()) {
1088  capital_word.set_certainty(blob_choice_it.data()->certainty());
1089  }
1090  upper_certainties[x] = blob_choice_it.data()->certainty();
1091  upper_done = TRUE;
1092  }
1093  if (!char_alpha) {
1094  const CHAR_FRAGMENT *fragment =
1095  getUnicharset().get_fragment(unichar_id);
1096  temp_id = !fragment ? unichar_id :
1097  getUnicharset().unichar_to_id(fragment->get_unichar());
1098  if (getUnicharset().get_isalpha(temp_id)) {
1099  char_alpha = TRUE;
1100  }
1101  }
1102  if (lower_done && upper_done)
1103  break;
1104  }
1105  }
1106  if (char_alpha && any_alpha != NULL)
1107  *any_alpha = TRUE;
1108 
1109  if (word.rating() > bestrate_pruning_factor * *rating_limit) {
1110  if (permute_debug)
1111  tprintf("\n***** Aborting high-cost word: %g > limit %g\n",
1112  word.rating(), bestrate_pruning_factor * *rating_limit);
1113  return (NULL);
1114  }
1115 
1116  *prev_char = '\0';
1117  temp_id = word.unichar_id(word.length()-1);
1118  if (temp_id != INVALID_UNICHAR_ID) {
1119  strcpy(prev_char, getUnicharset().id_to_unichar(temp_id));
1120  }
1121  }
1122 
1123  if (raw_choice != NULL && word.rating() < raw_choice->rating()) {
1124  *raw_choice = word;
1125  LogNewChoice(1.0, certainties, true, raw_choice, char_choices);
1126  }
1127  float rating = word.rating();
1128  adjust_non_word(&word, certainties, &char_choices, permute_debug);
1129 
1130  float lower_rating = lower_word.rating();
1131  adjust_non_word(&lower_word, lower_certainties, &char_choices,
1132  permute_debug);
1133 
1134  float upper_rating = capital_word.rating();
1135  adjust_non_word(&capital_word, upper_certainties, &char_choices,
1136  permute_debug);
1137 
1138  WERD_CHOICE *best_choice = &word;
1139  *rating_limit = rating;
1140  if (lower_word.rating() < best_choice->rating()) {
1141  best_choice = &lower_word;
1142  *rating_limit = lower_rating;
1143  }
1144  if (capital_word.rating() < best_choice->rating()) {
1145  best_choice = &capital_word;
1146  *rating_limit = upper_rating;
1147  }
1148  return new WERD_CHOICE(*best_choice);
1149 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list)
Definition: permute.cpp:99
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
const char * choose_il1(const char *first_char, const char *second_char, const char *third_char, const char *prev_char, const char *next_char, const char *next_next_char)
Definition: permute.cpp:1163
unsigned char BOOL8
Definition: host.h:113
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
bool permute_debug
Definition: dict.h:872
#define FALSE
Definition: capi.h:28
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void adjust_non_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool debug)
Definition: dict.h:719
#define MAX_PERM_LENGTH
Definition: permute.h:36
int length() const
Definition: genericvector.h:63
SIGNED char inT8
Definition: host.h:98
#define UNICHAR_LEN
Definition: unichar.h:28
const char * get_unichar() const
Definition: unicharset.h:52
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
#define ASSERT_HOST(x)
Definition: errcode.h:84
double bestrate_pruning_factor
Definition: dict.h:874
float rating() const
Definition: ratngs.h:231
#define TRUE
Definition: capi.h:27
void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)
Definition: stopper.cpp:484
void tesseract::Dict::PrintAmbigAlternatives ( FILE *  file,
const char *  label,
int  label_num_unichars 
)

Print all the choices in raw_choices_ list for non 1-1 ambiguities.

Definition at line 358 of file stopper.cpp.

359  {
360  iterate(raw_choices_) {
361  VIABLE_CHOICE Choice = (VIABLE_CHOICE)first_node(raw_choices_);
362  if (Choice->Length > 0 &&
363  (label_num_unichars > 1 || Choice->Length > 1)) {
364  for (int i = 0; i < Choice->Length; i++) {
365  fprintf(file, "%s",
366  getUnicharset().id_to_unichar(Choice->Blob[i].Class));
367  }
368  fflush(file);
369  fprintf(file, "\t%s\t%.4f\t%.4f\n", label,
370  Choice->Rating, Choice->Certainty);
371  }
372  }
373 }
CHAR_CHOICE * Blob
Definition: stopper.h:74
UNICHAR_ID Class
Definition: stopper.h:51
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
VIABLE_CHOICE_STRUCT * VIABLE_CHOICE
Definition: stopper.h:86
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
void tesseract::Dict::PrintViableChoice ( FILE *  File,
const char *  Label,
VIABLE_CHOICE  Choice 
)

Dumps a text representation of the specified Choice to File.

Definition at line 912 of file stopper.cpp.

912  {
913  int i, j;
914  fprintf (File, "%s", Label);
915  fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ",
916  Choice->Rating, Choice->Certainty,
917  Choice->AdjustFactor, Choice->ComposedFromCharFragments);
918 
919  for (i = 0; i < Choice->Length; i++)
920  fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
921  fprintf(File, "\n");
922 
923  for (i = 0; i < Choice->Length; i++) {
924  fprintf(File, " %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
925  for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
926  fprintf(File, " ");
927  }
928  fprintf(File, "\n");
929 
930  for (i = 0; i < Choice->Length; i++) {
931  for (j = 0; j < Choice->Blob[i].NumChunks; j++)
932  fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0));
933  }
934  fprintf(File, "\n");
935 
936  for (i = 0; i < Choice->Length; i++) {
937  for (j = 0; j < Choice->Blob[i].NumChunks; j++)
938  fprintf(File, "%3d ", Choice->Blob[i].NumChunks);
939  }
940  fprintf(File, "\n");
941 }
CHAR_CHOICE * Blob
Definition: stopper.h:74
UNICHAR_ID Class
Definition: stopper.h:51
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
uinT16 NumChunks
Definition: stopper.h:52
bool ComposedFromCharFragments
Definition: stopper.h:73
FLOAT32 AdjustFactor
Definition: stopper.h:72
float Certainty
Definition: stopper.h:53
double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 576 of file dict.h.

579  {
580  return (this->*probability_in_context_)(
581  getImage()->getCCUtil()->lang.string(),
582  context, context_bytes,
583  character, character_bytes);
584  }
const CCUtil * getCCUtil() const
Definition: image.h:29
const char * string() const
Definition: strngs.cpp:156
STRING lang
Definition: ccutil.h:69
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:570
const Image * getImage() const
Definition: dict.h:94
void tesseract::Dict::ProcessPatternEdges ( const Dawg dawg,
const DawgInfo info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgArgs dawg_args,
PermuterType *  current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself), finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_active_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 554 of file dict.cpp.

557  {
558  NODE_REF node = GetStartingNode(dawg, info.ref);
559  // Try to find the edge corresponding to the exact unichar_id and to all the
560  // edges corresponding to the character class of unichar_id.
561  GenericVector<UNICHAR_ID> unichar_id_patterns;
562  unichar_id_patterns.push_back(unichar_id);
563  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
564  &unichar_id_patterns);
565  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
566  // On the first iteration check all the outgoing edges.
567  // On the second iteration check all self-loops.
568  for (int k = 0; k < 2; ++k) {
569  EDGE_REF edge = (k == 0) ?
570  dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
571  : dawg->pattern_loop_edge(info.ref, unichar_id_patterns[i], word_end);
572  if (edge != NO_EDGE) {
573  if (dawg_debug_level >= 3) {
574  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
575  info.dawg_index, node, edge);
576  }
577  if (ConstraintsOk(*(dawg_args->updated_constraints),
578  word_end, dawg->type())) {
579  if (dawg_debug_level >=3) {
580  tprintf("Letter found in pattern dawg %d\n", info.dawg_index);
581  }
582  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
583  dawg_args->updated_active_dawgs->add_unique(
584  DawgInfo(info.dawg_index, edge), dawg_debug_level > 0,
585  "Append current dawg to updated active dawgs: ");
586  }
587  }
588  }
589  }
590 }
#define REFFORMAT
Definition: dawg.h:92
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:620
inT64 EDGE_REF
Definition: dawg.h:54
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
int push_back(T object)
bool ConstraintsOk(const DawgInfoVector &constraints, int word_end, DawgType current_dawg_type) const
Definition: dict.h:631
int dawg_debug_level
Definition: dict.h:839
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT64 NODE_REF
Definition: dawg.h:55
int size() const
Definition: genericvector.h:59
void tesseract::Dict::ReadFixedLengthDawgs ( DawgType  type,
const STRING lang,
PermuterType  perm,
int  debug_level,
FILE *  file,
DawgVector dawg_vec,
int *  max_wdlen 
)
static

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages). Reads a sequence of dawgs from the given file. Appends the constructed dawgs to the given dawg_vec. Fills the given table with indices of the dawgs in the dawg_vec corresponding to the dawgs with words of a particular length.

Definition at line 592 of file dict.cpp.

595  {
596  int i;
597  DawgVector dawg_vec_copy;
598  dawg_vec_copy.move(dawg_vec); // save the input dawg_vec.
599  inT32 num_dawgs;
600  fread(&num_dawgs, sizeof(inT32), 1, file);
601  bool swap = (num_dawgs > MAX_WERD_LENGTH);
602  if (swap) num_dawgs = reverse32(num_dawgs);
603  inT32 word_length;
604  int max_word_length = 0;
605  // Read and record pointers to fixed-length dawgs such that:
606  // dawg_vec[word_length] = pointer to dawg with word length of word_length,
607  // NULL if such fixed-length dawg does not exist.
608  for (i = 0; i < num_dawgs; ++i) {
609  fread(&word_length, sizeof(inT32), 1, file);
610  if (swap) word_length = reverse32(word_length);
611  ASSERT_HOST(word_length > 0 && word_length <= MAX_WERD_LENGTH);
612  while (word_length >= dawg_vec->size()) dawg_vec->push_back(NULL);
613  (*dawg_vec)[word_length] =
614  new SquishedDawg(file, type, lang, perm, debug_level);
615  if (word_length > max_word_length) max_word_length = word_length;
616  }
617  *max_wdlen = max_word_length;
618  // Entries dawg_vec[0] to dawg_vec[max_word_length] now hold pointers
619  // to fixed-length dawgs. The rest of the vector will contain the dawg
620  // pointers from the original input dawg_vec.
621  for (i = 0; i < dawg_vec_copy.size(); ++i) {
622  dawg_vec->push_back(dawg_vec_copy[i]);
623  }
624 }
GenericVector< Dawg * > DawgVector
Definition: dict.h:47
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
DLLSYM uinT32 reverse32(uinT32 num)
Definition: serialis.cpp:36
void move(GenericVector< T > *from)
#define MAX_WERD_LENGTH
Definition: dict.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::Dict::remove_hyphen_head ( WERD_CHOICE word) const
inline

Erase the unichar ids corresponding to the portion of the word from the previous line. The word is not changed if it is not split between lines and hyphenated.

Definition at line 137 of file dict.h.

137  {
138  if (this->hyphenated()) {
139  word->remove_unichar_ids(0, hyphen_word_->length());
140  if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
141  }
142  }
int length() const
Definition: ratngs.h:214
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:118
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:213
int hyphen_debug_level
Definition: dict.h:840
const void print() const
Definition: ratngs.h:406
void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE werd_choice,
BLOB_CHOICE_LIST_VECTOR blob_choices,
bool *  modified_blobs 
)

Replaces the corresponding wrong ngram in werd_choice with the correct one. We indicate that this newly inserted ngram unichar is composed from several fragments and modify the corresponding entries in blob_choices to contain fragments of the correct ngram unichar instead of the original unichars. Ratings and certainties of entries in blob_choices and werd_choice are unchanged. E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes mystring", first ' in blob_choices becomes |"|0|2, second one is set to |"|1|2.

Definition at line 802 of file stopper.cpp.

805  {
806  int num_blobs_to_replace = 0;
807  int begin_blob_index = 0;
808  int i;
809  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
810  if (i >= wrong_ngram_begin_index) {
811  num_blobs_to_replace += werd_choice->fragment_length(i);
812  } else {
813  begin_blob_index += werd_choice->fragment_length(i);
814  }
815  }
816  BLOB_CHOICE_IT bit;
817  int temp_blob_index = begin_blob_index;
818  const char *temp_uch = NULL;
819  const char *correct_ngram_str =
820  getUnicharset().id_to_unichar(correct_ngram_id);
821  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
822  ++replaced_count) {
823  if (blob_choices != NULL) {
824  UNICHAR_ID uch_id = werd_choice->unichar_id(wrong_ngram_begin_index);
825  int fraglen = werd_choice->fragment_length(wrong_ngram_begin_index);
826  if (fraglen > 1) temp_uch = getUnicharset().id_to_unichar(uch_id);
827  for (i = 0; i < fraglen; ++i) {
828  if (fraglen > 1) {
829  STRING frag_str =
830  CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false);
831  getUnicharset().unichar_insert(frag_str.string());
832  uch_id = getUnicharset().unichar_to_id(frag_str.string());
833  }
834  bit.set_to_list(blob_choices->get(temp_blob_index));
835  STRING correct_frag_uch =
836  CHAR_FRAGMENT::to_string(correct_ngram_str,
837  temp_blob_index - begin_blob_index,
838  num_blobs_to_replace, false);
839  getUnicharset().unichar_insert(correct_frag_uch.string());
840  UNICHAR_ID correct_frag_uch_id =
841  getUnicharset().unichar_to_id(correct_frag_uch.string());
842  // Find the WERD_CHOICE corresponding to the original unichar in
843  // the list of blob choices, add the derived character fragment
844  // before it with the same rating and certainty.
845  for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
846  if (bit.data()->unichar_id() == correct_frag_uch_id) {
847  break; // the unichar we want to insert is already there
848  }
849  if (bit.data()->unichar_id() == uch_id) {
850  bit.add_before_then_move(new BLOB_CHOICE(*(bit.data())));
851  bit.data()->set_unichar_id(correct_frag_uch_id);
852  if (modified_blobs != NULL) *modified_blobs = true;
853  break;
854  }
855  }
856  temp_blob_index++;
857  }
858  }
859  // Remove current unichar from werd_choice. On the last iteration
860  // set the correct replacement unichar instead of removing a unichar.
861  if (replaced_count + 1 == wrong_ngram_size) {
862  werd_choice->set_unichar_id(correct_ngram_id,
863  num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index);
864  } else {
865  werd_choice->remove_unichar_id(wrong_ngram_begin_index);
866  }
867  }
868  if (stopper_debug_level >= 1 && modified_blobs != NULL &&
869  *modified_blobs && blob_choices != NULL) {
870  werd_choice->print("ReplaceAmbig() ");
871  tprintf("Modified blob_choices: ");
872  for (int i = 0; i < blob_choices->size(); ++i) {
873  print_ratings_list("\n", blob_choices->get(i), getUnicharset());
874  }
875  }
876 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
STRING to_string() const
Definition: unicharset.h:61
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:511
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
const char * string() const
Definition: strngs.cpp:156
const void print() const
Definition: ratngs.h:406
int stopper_debug_level
Definition: dict.h:856
void remove_unichar_id(int index)
Definition: ratngs.h:357
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const char fragment_length(int index) const
Definition: ratngs.h:227
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:247
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on the line and the current one is not (thus it is the first one on the line), erases hyphen_word_ and clears hyphen_active_dawgs_ and hyphen_constraints_. In all cases, updates last_word_on_line_.

Definition at line 32 of file hyphen.cpp.

32  {
33  if (!(last_word_on_line_ == true && last_word_on_line == false)) {
34  if (hyphen_word_ != NULL) {
35  delete hyphen_word_;
36  hyphen_word_ = NULL;
37  hyphen_active_dawgs_.clear();
38  hyphen_constraints_.clear();
39  }
40  }
41  if (hyphen_debug_level) {
42  tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n",
43  last_word_on_line_, last_word_on_line);
44  }
45  last_word_on_line_ = last_word_on_line;
46 }
#define NULL
Definition: host.h:144
int hyphen_debug_level
Definition: dict.h:840
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 478 of file dict.h.

478  {
479  if (pending_words_ != NULL)
480  pending_words_->clear();
481  if (document_words_ != NULL)
482  document_words_->clear();
483  }
void clear()
Definition: trie.cpp:65
#define NULL
Definition: host.h:144
void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE word,
const DawgInfoVector active_dawgs,
const DawgInfoVector constraints 
)

Update hyphen_word_, and copy the given DawgInfoVectors into hyphen_active_dawgs_ and hyphen_constraints_.

Definition at line 50 of file hyphen.cpp.

52  {
53  if (hyphen_word_ == NULL) {
54  hyphen_word_ = new WERD_CHOICE(word.unicharset());
55  hyphen_word_->make_bad();
56  }
57  if (hyphen_word_->rating() > word.rating()) {
58  *hyphen_word_ = word;
59  // Remove the last unichar id as it is a hyphen, and remove
60  // any unichar_string/lengths that are present.
61  hyphen_word_->remove_last_unichar_id();
62  hyphen_active_dawgs_ = active_dawgs;
63  hyphen_constraints_ = constraints;
64  }
65  if (hyphen_debug_level) {
66  hyphen_word_->print("set_hyphen_word: ");
67  }
68 }
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:321
#define NULL
Definition: host.h:144
int hyphen_debug_level
Definition: dict.h:840
const void print() const
Definition: ratngs.h:406
void remove_last_unichar_id()
Definition: ratngs.h:356
const UNICHARSET * unicharset() const
Definition: ratngs.h:211
float rating() const
Definition: ratngs.h:231
void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 780 of file stopper.cpp.

780  {
781  reject_offset_ = 0.0;
782 }
void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 784 of file stopper.cpp.

784  {
785  reject_offset_ = stopper_phase2_certainty_rejection_offset;
786 }
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:849
void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 726 of file dict.h.

726  {
727  wordseg_rating_adjust_factor_ = f;
728  }
#define f(xc, yc)
Definition: imgscale.cpp:39
double tesseract::Dict::StopperAmbigThreshold ( double  f1,
double  f2 
)
inline

Definition at line 322 of file dict.h.

322  {
323  return (f2 - f1) * stopper_ambiguity_threshold_gain -
324  stopper_ambiguity_threshold_offset;
325  }
double stopper_ambiguity_threshold_offset
Definition: dict.h:863
double stopper_ambiguity_threshold_gain
Definition: dict.h:861
bool tesseract::Dict::StringSameAs ( const WERD_CHOICE WordChoice,
VIABLE_CHOICE  ViableChoice 
)

Compares unichar ids in word_choice to those in viable_choice, returns true if they are the same.

Definition at line 951 of file stopper.cpp.

952  {
953  if (WordChoice.length() != ViableChoice->Length) {
954  return false;
955  }
956  int i;
957  CHAR_CHOICE *CharChoice;
958  for (i = 0, CharChoice = &(ViableChoice->Blob[0]);
959  i < ViableChoice->Length; CharChoice++, i++) {
960  if (CharChoice->Class != WordChoice.unichar_id(i)) {
961  return false;
962  }
963  }
964  return true;
965 }
int length() const
Definition: ratngs.h:214
CHAR_CHOICE * Blob
Definition: stopper.h:74
UNICHAR_ID Class
Definition: stopper.h:51
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
bool tesseract::Dict::StringSameAs ( const char *  String,
const char *  String_lengths,
VIABLE_CHOICE  ViableChoice 
)

Compares String to ViableChoice and returns true if they are the same.

Definition at line 967 of file stopper.cpp.

969  {
970  CHAR_CHOICE *Char;
971  int i;
972  int current_unichar_length;
973 
974  for (Char = &(ViableChoice->Blob[0]), i = 0;
975  i < ViableChoice->Length;
976  String += *(String_lengths++), Char++, i++) {
977  current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class));
978  if (current_unichar_length != *String_lengths ||
979  strncmp(String, getUnicharset().id_to_unichar(Char->Class),
980  current_unichar_length) != 0)
981  return false;
982  }
983  return (*String == 0) ? true : false;
984 }
CHAR_CHOICE * Blob
Definition: stopper.h:74
UNICHAR_ID Class
Definition: stopper.h:51
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
WERD_CHOICE * tesseract::Dict::top_fragments_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)

top_fragments_permute_and_select

Creates a copy of the character choices list that contains only fragments and the best non-fragmented character choice. Permutes the characters in this shortened list, builds characters from fragments if possible, and returns a better choice if found.

Definition at line 1367 of file permute.cpp.

1369  {
1370  if (char_choices.length() <= 1 ||
1371  char_choices.length() > MAX_PERM_LENGTH) {
1372  return NULL;
1373  }
1374  // See if it would be possible to benefit from permuting fragments.
1375  int x;
1376  float min_rating = 0.0;
1377  BLOB_CHOICE_IT blob_choice_it;
1378  for (x = 0; x < char_choices.length(); ++x) {
1379  blob_choice_it.set_to_list(char_choices.get(x));
1380  if (blob_choice_it.data()) {
1381  min_rating += blob_choice_it.data()->rating();
1382  }
1383  if (min_rating >= rating_limit) {
1384  return NULL;
1385  }
1386  }
1387  if (fragments_debug > 1) {
1388  tprintf("A choice with fragment beats top choice\n");
1389  tprintf("Running fragment permuter...\n");
1390  }
1391 
1392  // Construct a modified choices list that contains (for each position):
1393  // the best choice, all fragments and at least one choice for
1394  // a non-fragmented character.
1395  BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length());
1396  for (x = 0; x < char_choices.length(); ++x) {
1397  bool need_nonfrag_char = true;
1398  BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST();
1399  BLOB_CHOICE_IT frag_choices_it;
1400  frag_choices_it.set_to_list(frag_choices);
1401  blob_choice_it.set_to_list(char_choices.get(x));
1402  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1403  blob_choice_it.forward()) {
1404  if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) {
1405  frag_choices_it.add_after_then_move(
1406  new BLOB_CHOICE(*(blob_choice_it.data())));
1407  } else if (need_nonfrag_char) {
1408  frag_choices_it.add_after_then_move(
1409  new BLOB_CHOICE(*(blob_choice_it.data())));
1410  need_nonfrag_char = false;
1411  }
1412  }
1413  frag_char_choices += frag_choices;
1414  }
1415 
1416  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
1417  best_choice->make_bad();
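1418  WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
1419  word.set_permuter(TOP_CHOICE_PERM);
1420  float certainties[MAX_PERM_LENGTH];
1421  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
1422  int attempts_left = max_permuter_attempts;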
1423  permute_choices((fragments_debug > 1) ? "fragments_debug" : NULL,
1424  frag_char_choices, 0, NULL, &word, certainties,
1425  &rating_limit, best_choice, &attempts_left, NULL);
1426 
1427  frag_char_choices.delete_data_pointers();
1428  return best_choice;
1429 }
void go_deeper_top_fragments_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permute.cpp:1538
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:321
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function that will be modified by various permuters.
Definition: dict.h:308
T & get(int index) const
int max_permuter_attempts
Definition: dict.h:909
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permute.cpp:1437
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define MAX_PERM_LENGTH
Definition: permute.h:36
int length() const
Definition: genericvector.h:63
int fragments_debug
Definition: dict.h:870
char tesseract::Dict::top_word_chartype ( const BLOB_CHOICE_LIST_VECTOR char_choices,
char *  pos_chartypes 
)

Look up the main chartype for each character position and store it in the given array. Also returns the dominant type from unambiguous top choices.

Definition at line 512 of file permute.cpp.

513  {
514  const UNICHARSET &unicharset = getUnicharset();
515  const int hist_size = 128; // to contain 'A','a','0','x','p'
516  int chprop[hist_size];
517  int x;
518  for (x = 0; x < hist_size; x++) chprop[x] = 0;
519  for (x = 0; x < char_choices.length(); ++x) {
520  UNICHAR_ID unichar_id = get_top_choice_uid(char_choices.get(x));
521  char ctype = unicharset.get_chartype(unichar_id);
522  if (pos_chartypes) pos_chartypes[x] = ctype;
523  if (ctype == 0 || ctype == 'p') continue;
524  if (getUnicharAmbigs().OneToOneDefiniteAmbigs(unichar_id) != NULL) continue;
525  chprop[ctype]++;
526  if (x == 0 && ctype == 'A') // first-cap also counts as lower
527  chprop['a']++;
528  }
529  int max_prop = 0;
530  for (x = 1; x < hist_size; x++)
531  if (chprop[x] >= chprop[max_prop]) max_prop = x;
532  return (chprop[max_prop] > 0) ? max_prop : 0;
533 }
UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list)
Definition: permute.cpp:99
int UNICHAR_ID
Definition: unichar.h:31
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:502
int length() const
Definition: genericvector.h:63
const UnicharAmbigs & getUnicharAmbigs()
Definition: dict.h:106
int tesseract::Dict::UniformCertainties ( const BLOB_CHOICE_LIST_VECTOR Choices,
const WERD_CHOICE BestChoice 
)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 986 of file stopper.cpp.

987  {
988  float Certainty;
989  float WorstCertainty = MAX_FLOAT32;
990  float CertaintyThreshold;
991  FLOAT64 TotalCertainty;
992  FLOAT64 TotalCertaintySquared;
993  FLOAT64 Variance;
994  FLOAT32 Mean, StdDev;
995  int WordLength;
996 
997  WordLength = Choices.length();
998  if (WordLength < 3)
999  return true;
1000 
1001  TotalCertainty = TotalCertaintySquared = 0.0;
1002  BLOB_CHOICE_IT BlobChoiceIt;
1003  for (int i = 0; i < Choices.length(); ++i) {
1004  BlobChoiceIt.set_to_list(Choices.get(i));
1005  Certainty = BlobChoiceIt.data()->certainty();
1006  TotalCertainty += Certainty;
1007  TotalCertaintySquared += Certainty * Certainty;
1008  if (Certainty < WorstCertainty)
1009  WorstCertainty = Certainty;
1010  }
1011 
1012  // Subtract off worst certainty from statistics.
1013  WordLength--;
1014  TotalCertainty -= WorstCertainty;
1015  TotalCertaintySquared -= WorstCertainty * WorstCertainty;
1016 
1017  Mean = TotalCertainty / WordLength;
1018  Variance = ((WordLength * TotalCertaintySquared -
1019  TotalCertainty * TotalCertainty) /
1020  (WordLength * (WordLength - 1)));
1021  if (Variance < 0.0)
1022  Variance = 0.0;
1023  StdDev = sqrt (Variance);
1024 
1025  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
1026  if (CertaintyThreshold > stopper_nondict_certainty_base)
1027  CertaintyThreshold = stopper_nondict_certainty_base;
1028 
1029  if (BestChoice.certainty() < CertaintyThreshold) {
1030  if (stopper_debug_level >= 1)
1031  cprintf("Stopper: Non-uniform certainty = %4.1f"
1032  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
1033  BestChoice.certainty(), Mean, StdDev, CertaintyThreshold);
1034  return false;
1035  } else {
1036  return true;
1037  }
1038 }
float certainty() const
Definition: ratngs.h:234
T & get(int index) const
float FLOAT32
Definition: host.h:111
double FLOAT64
Definition: host.h:112
double stopper_allowable_character_badness
Definition: dict.h:855
int stopper_debug_level
Definition: dict.h:856
double stopper_nondict_certainty_base
Definition: dict.h:847
#define MAX_FLOAT32
Definition: host.h:124
int length() const
Definition: genericvector.h:63
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:639
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
void tesseract::Dict::update_best_choice ( const WERD_CHOICE word,
WERD_CHOICE best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 166 of file dict.h.

167  {
168  if (word.rating() < best_choice->rating()) *best_choice = word;
169  }
float rating() const
Definition: ratngs.h:231
bool tesseract::Dict::valid_bigram ( const WERD_CHOICE word1,
const WERD_CHOICE word2 
) const

Definition at line 849 of file dict.cpp.

850  {
851  if (bigram_dawg_ == NULL) return false;
852 
853  // Extract the core word from the middle of each word with any digits
854  // replaced with question marks.
855  int w1start, w1end, w2start, w2end;
856  word1.punct_stripped(&w1start, &w1end);
857  word2.punct_stripped(&w2start, &w2end);
858 
859  // We don't want to penalize a single guillemet, hyphen, etc.
860  // But our bigram list doesn't have any information about punctuation.
861  if (w1start >= w1end) return word1.length() < 3;
862  if (w2start >= w2end) return word2.length() < 3;
863 
864  const UNICHARSET& uchset = getUnicharset();
865  STRING bigram_string;
866  for (int i = w1start; i < w1end; i++) {
867  UNICHAR_ID ch = NormalizeUnicharIdForMatch(word1.unichar_id(i));
868  bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch);
869  }
870  bigram_string += " ";
871  for (int i = w2start; i < w2end; i++) {
872  UNICHAR_ID ch = NormalizeUnicharIdForMatch(word2.unichar_id(i));
873  bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch);
874  }
875  WERD_CHOICE normalized_word(bigram_string.string(), uchset);
876  return bigram_dawg_->word_in_dawg(normalized_word);
877 }
int length() const
Definition: ratngs.h:214
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:245
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:48
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const
Definition: dict.cpp:368
Definition: strngs.h:40
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 879 of file dict.cpp.

879  {
880  if (word.length() == 0) return NO_PERM;
881  int i;
882  WERD_CHOICE new_word(word.unicharset());
883  int last_index = word.length() - 1;
884  int new_len = 0;
885  for (i = 0; i <= last_index; ++i) {
886  UNICHAR_ID unichar_id = (word.unichar_id(i));
887  if (getUnicharset().get_ispunctuation(unichar_id)) {
888  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
889  } else if (!getUnicharset().get_isalpha(unichar_id) &&
890  !getUnicharset().get_isdigit(unichar_id)) {
891  return false; // neither punc, nor alpha, nor digit
892  } else if ((new_len = new_word.length()) == 0 ||
893  new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
894  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
895  }
896  }
897  for (i = 0; i < dawgs_.size(); ++i) {
898  if (dawgs_[i] != NULL &&
899  dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
900  dawgs_[i]->word_in_dawg(new_word)) return true;
901  }
902  return false;
903 }
int length() const
Definition: ratngs.h:214
int UNICHAR_ID
Definition: unichar.h:31
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:129
int size() const
Definition: genericvector.h:59
const UNICHARSET * unicharset() const
Definition: ratngs.h:211
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
int tesseract::Dict::valid_word ( const WERD_CHOICE word,
bool  numbers_ok 
) const

Definition at line 807 of file dict.cpp.

807  {
808  const WERD_CHOICE *word_ptr = &word;
809  WERD_CHOICE temp_word(word.unicharset());
810  if (hyphenated()) {
811  copy_hyphen_info(&temp_word);
812  temp_word += word;
813  word_ptr = &temp_word;
814  }
815  if (word_ptr->length() == 0) return NO_PERM;
816  // Allocate vectors for holding current and updated
817  // active_dawgs and constraints and initialize them.
818  DawgInfoVector *active_dawgs = new DawgInfoVector[2];
819  DawgInfoVector *constraints = new DawgInfoVector[2];
820  init_active_dawgs(kAnyWordLength, &(active_dawgs[0]), false);
821  init_constraints(&(constraints[0]));
822  DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
823  &(active_dawgs[1]), &(constraints[1]),
824  0.0, NO_PERM, kAnyWordLength, 0);
825  int last_index = word_ptr->length() - 1;
826  // Call letter_is_okay for each letter in the word.
827  for (int i = hyphen_base_size(); i <= last_index; ++i) {
828  if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
829  i == last_index))) break;
830  // Swap active_dawgs, constraints with the corresponding updated vector.
831  if (dawg_args.updated_active_dawgs == &(active_dawgs[1])) {
832  dawg_args.updated_active_dawgs = &(active_dawgs[0]);
833  dawg_args.updated_constraints = &(constraints[0]);
834  ++(dawg_args.active_dawgs);
835  ++(dawg_args.constraints);
836  } else {
837  ++(dawg_args.updated_active_dawgs);
838  ++(dawg_args.updated_constraints);
839  dawg_args.active_dawgs = &(active_dawgs[0]);
840  dawg_args.constraints = &(constraints[0]);
841  }
842  }
843  delete[] active_dawgs;
844  delete[] constraints;
845  return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
846  dawg_args.permuter : NO_PERM;
847 }
int length() const
Definition: ratngs.h:214
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:118
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:128
void init_active_dawgs(int sought_word_length, DawgInfoVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:643
void init_constraints(DawgInfoVector *constraints) const
Definition: dict.cpp:677
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:560
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:679
const UNICHARSET * unicharset() const
Definition: ratngs.h:211
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:122
int tesseract::Dict::valid_word ( const WERD_CHOICE word) const
inline

Definition at line 685 of file dict.h.

685  {
686  return valid_word(word, false); // return NO_PERM for words with digits
687  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:807
int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 692 of file dict.h.

692  {
693  WERD_CHOICE word(string, getUnicharset());
694  return valid_word(word);
695  }
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:807
int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE word) const
inline

Definition at line 688 of file dict.h.

688  {
689  return valid_word(word, true); // return NUMBER_PERM for valid numbers
690  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:807
static bool tesseract::Dict::valid_word_permuter ( uinT8  perm,
bool  numbers_ok 
)
inlinestatic

Check all the DAWGs to see if this word is in any of them.

Definition at line 679 of file dict.h.

679  {
680  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
681  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
682  perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
683  }
void tesseract::Dict::WriteFixedLengthDawgs ( const GenericVector< SquishedDawg * > &  dawg_vec,
int  num_dawgs,
int  debug_level,
FILE *  output_file 
)
static

Writes the dawgs in the dawg_vec to a file. Updates the given table with the indices of dawgs in the dawg_vec for the corresponding word lengths.

Definition at line 626 of file dict.cpp.

628  {
629  fwrite(&num_dawgs, sizeof(inT32), 1, output_file);
630  if (debug_level) tprintf("Writing %d split length dawgs\n", num_dawgs);
631  for (int i = 1; i < dawg_vec.size(); ++i) {
632  if ((dawg_vec)[i] != NULL) {
633  fwrite(&i, sizeof(inT32), 1, output_file);
634  dawg_vec[i]->write_squished_dawg(output_file);
635  if (debug_level) tprintf("Wrote Dawg with word length %d\n", i);
636  }
637  }
638 }
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int size() const
Definition: genericvector.h:59

Member Data Documentation

double tesseract::Dict::bestrate_pruning_factor = 2.0

"Multiplying factor of current best rate to prune other hypotheses"

Definition at line 874 of file dict.h.

double tesseract::Dict::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 845 of file dict.h.

int tesseract::Dict::dawg_debug_level = 0

"Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages"

Definition at line 839 of file dict.h.

double tesseract::Dict::doc_dict_certainty_threshold = -2.25

"Worst certainty" " for words that can be inserted into the document dictionary"

Definition at line 902 of file dict.h.

bool tesseract::Dict::doc_dict_enable = 1

"Enable Document Dictionary "

Definition at line 898 of file dict.h.

double tesseract::Dict::doc_dict_pending_threshold = 0.0

"Worst certainty for using pending dictionary"

Definition at line 900 of file dict.h.

int tesseract::Dict::fragments_debug = 0

"Debug character fragments"

Definition at line 870 of file dict.h.

void(Dict::* tesseract::Dict::go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function that will be modified by various permuters.

Definition at line 308 of file dict.h.

int tesseract::Dict::hyphen_debug_level = 0

"Debug level for hyphenated words."

Definition at line 840 of file dict.h.

int(Dict::* tesseract::Dict::letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 560 of file dict.h.

bool tesseract::Dict::load_bigram_dawg = false

"Load dawg with special word bigrams."

Definition at line 811 of file dict.h.

bool tesseract::Dict::load_fixed_length_dawgs = true

"Load fixed length" " dawgs (e.g. for non-space delimited languages)"

Definition at line 809 of file dict.h.

bool tesseract::Dict::load_freq_dawg = true

"Load frequent word dawg."

Definition at line 803 of file dict.h.

bool tesseract::Dict::load_number_dawg = true

"Load dawg with number patterns."

Definition at line 807 of file dict.h.

bool tesseract::Dict::load_punc_dawg = true

"Load dawg with punctuation patterns."

Definition at line 806 of file dict.h.

bool tesseract::Dict::load_system_dawg = true

"Load system word dawg."

Definition at line 802 of file dict.h.

bool tesseract::Dict::load_unambig_dawg = true

"Load unambiguous word dawg."

Definition at line 804 of file dict.h.

int tesseract::Dict::max_permuter_attempts = 10000

"Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options."

Definition at line 909 of file dict.h.

int tesseract::Dict::max_viterbi_list_size = 10

"Maximum size of viterbi list."

Definition at line 841 of file dict.h.

bool tesseract::Dict::ngram_permuter_activated = false

"Activate character-level n-gram-based permuter"

Definition at line 904 of file dict.h.

char* tesseract::Dict::output_ambig_words_file = ""

"Output file for ambiguities found in the dictionary"

Definition at line 837 of file dict.h.

bool tesseract::Dict::permute_chartype_word = 0

"Turn on character type (property) consistency permuter"

Definition at line 890 of file dict.h.

bool tesseract::Dict::permute_debug = 0

"Debug char permutation process"

Definition at line 872 of file dict.h.

bool tesseract::Dict::permute_fixed_length_dawg = 0

"Turn on fixed-length phrasebook search permuter"

Definition at line 888 of file dict.h.

bool tesseract::Dict::permute_only_top = false

"Run only the top choice permuter"

Definition at line 910 of file dict.h.

bool tesseract::Dict::permute_script_word = 0

"Turn on word script consistency permuter"

Definition at line 876 of file dict.h.

double(Dict::* tesseract::Dict::probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 570 of file dict.h.

bool tesseract::Dict::save_doc_words = 0

"Save Document Words"

Definition at line 897 of file dict.h.

bool tesseract::Dict::save_raw_choices = false

"Save all explored raw choices"

Definition at line 864 of file dict.h.

int tesseract::Dict::segment_debug = 0

"Debug the whole segmentation process"

Definition at line 871 of file dict.h.

bool tesseract::Dict::segment_nonalphabetic_script = false

"Don't use any alphabetic-specific tricks." " Set to true in the traineddata config file for" " scripts that are cursive or inherently fixed-pitch"

Definition at line 882 of file dict.h.

double tesseract::Dict::segment_penalty_dict_case_bad = 1.3125

"Default score multiplier for word matches, which may have " "case issues (lower is better)."

Definition at line 822 of file dict.h.

double tesseract::Dict::segment_penalty_dict_case_ok = 1.1

"Score multiplier for word matches that have good case " "(lower is better)."

Definition at line 818 of file dict.h.

double tesseract::Dict::segment_penalty_dict_frequent_word = 1.0

"Score multiplier for word matches which have good case and" " are frequent in the given language (lower is better)."

Definition at line 814 of file dict.h.

double tesseract::Dict::segment_penalty_dict_nonword = 1.25

"Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better)."

Definition at line 830 of file dict.h.

double tesseract::Dict::segment_penalty_garbage = 1.50

"Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."

Definition at line 835 of file dict.h.

double tesseract::Dict::segment_penalty_ngram_best_choice = 1.24

"Multiplier for the best choice from the ngram model."

Definition at line 826 of file dict.h.

double tesseract::Dict::segment_reward_chartype = 0.97

"Score multiplier for char type consistency within a word. "

Definition at line 892 of file dict.h.

double tesseract::Dict::segment_reward_ngram_best_choice = 0.99

"Score multiplier for ngram permuter's best choice" " (only used in the Han script path)."

Definition at line 896 of file dict.h.

double tesseract::Dict::segment_reward_script = 0.95

"Score multiplier for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " "Smaller value implies bigger reward."

Definition at line 886 of file dict.h.

bool tesseract::Dict::segment_segcost_rating = 0

"incorporate segmentation cost in word rating?"

Definition at line 878 of file dict.h.

double tesseract::Dict::stopper_allowable_character_badness = 3.0

"Max certainty variation allowed in a word (in sigma)"

Definition at line 855 of file dict.h.

double tesseract::Dict::stopper_ambiguity_threshold_gain = 8.0

"Gain factor for ambiguity threshold."

Definition at line 861 of file dict.h.

double tesseract::Dict::stopper_ambiguity_threshold_offset = 1.5

"Certainty offset for ambiguity threshold."

Definition at line 863 of file dict.h.

double tesseract::Dict::stopper_certainty_per_char = -0.50

"Certainty to add for each dict char above small word size."

Definition at line 853 of file dict.h.

int tesseract::Dict::stopper_debug_level = 0

"Stopper debug level"

Definition at line 856 of file dict.h.

bool tesseract::Dict::stopper_no_acceptable_choices = false

"Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"

Definition at line 859 of file dict.h.

double tesseract::Dict::stopper_nondict_certainty_base = -2.50

"Certainty threshold for non-dict words"

Definition at line 847 of file dict.h.

double tesseract::Dict::stopper_phase2_certainty_rejection_offset = 1.0

"Reject certainty offset"

Definition at line 849 of file dict.h.

int tesseract::Dict::stopper_smallword_size = 2

"Size of dict word to be treated as non-dict word"

Definition at line 851 of file dict.h.

int tesseract::Dict::tessedit_truncate_wordchoice_log = 10

"Max words to keep in list"

Definition at line 865 of file dict.h.

bool tesseract::Dict::use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities."

Definition at line 844 of file dict.h.

char* tesseract::Dict::user_patterns_suffix = ""

"A list of user-provided patterns."

Definition at line 801 of file dict.h.

char* tesseract::Dict::user_words_suffix = ""

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class. "A list of user-provided words."

Definition at line 799 of file dict.h.

char* tesseract::Dict::word_to_debug = ""

"Word for which stopper debug information" " should be printed to stdout"

Definition at line 867 of file dict.h.

char* tesseract::Dict::word_to_debug_lengths = ""

"Lengths of unichars in word_to_debug"

Definition at line 869 of file dict.h.


The documentation for this class was generated from the following files: