21 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
22 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
165 static int Compare(
const void *e1,
const void *e2) {
170 return (ve1->
cost < ve2->
cost) ? -1 : 1;
301 bool fixed_pitch,
float best_choice_cert,
302 float max_char_wh_ratio,
float rating_cert_scale,
330 int curr_col,
int curr_row,
331 BLOB_CHOICE_LIST *curr_list,
332 BLOB_CHOICE_LIST *parent_list,
359 int col,
int row,
float best_choice_cert,
371 float best_choice_cert,
396 float priority_adjustment,
397 float worst_piece_cert,
399 float best_choice_cert,
400 float max_char_wh_ratio,
414 float *cert,
bool *fragmented) {
446 return 1.0f / (1.0f + exp(10.0
f * cert));
448 return (-1.0
f / cert);
456 col > 0 && row+1 < dimension);
473 float *cert,
bool *fragmented) {
475 BLOB_CHOICE_IT bit(blist);
476 while (!bit.at_last() &&
IsFragment(bit.data())) {
482 if (bit.data()->certainty() < *cert) *cert = bit.data()->certainty();
486 if (num_problems == 0)
return 0.0f;
487 if (num_problems == 1)
return penalty;
489 static_cast<float>(num_problems-1)));
499 if (dawg_info !=
NULL) {
523 dawg_info, consistency_info)));
530 float ratings_sum,
int length,
float dawg_score,
547 BLOB_CHOICE_LIST *curr_list,
561 int curr_col,
int curr_row,
597 int curr_col,
int curr_row,
610 float certainty,
float denom,
611 int curr_col,
int curr_row,
624 const char *context,
int *unichar_step_len,
625 bool *found_small_prob,
float *ngram_prob);
693 int word_index,
int word_length,
694 int *skip,
int *covered,
696 bool *dawg_score_done);
700 float max_char_wh_ratio,
707 (parent_vse !=
NULL) ? parent_vse->
length : 0,
721 if (top_choice_flags)
return false;
722 if (dawg_info !=
NULL &&
723 (dawg_info->
permuter == SYSTEM_DAWG_PERM ||
724 dawg_info->
permuter == USER_DAWG_PERM ||
725 dawg_info->
permuter == FREQ_DAWG_PERM) &&
740 "Turn on/off the use of character ngram model");
742 "Maximum order of the character ngram model");
744 "Maximum number of prunable (those for which PrunablePath() is true)"
745 "entries in each viterbi list recorded in BLOB_CHOICEs");
747 "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
749 "To avoid overly small denominators use this as the floor"
750 " of the probability returned by the ngram model");
752 "Average classifier score of a non-matching unichar");
754 "Use only the first UTF8 step of the given string"
755 " when computing log probabilities");
757 "Strength of the character ngram model relative to the"
758 " character classifier ");
760 "Words are delimited by space");
763 "Minimum length of compound words");
765 "Depth of blob choice lists to explore"
766 " when fixed length dawgs are on");
769 "Penalty for words not in the frequent word dictionary");
771 "Penalty for non-dictionary words");
773 "Penalty for inconsistent punctuation");
775 "Penalty for inconsistent case");
777 "Penalty for inconsistent script");
779 "Penalty for inconsistent character type");
781 "Penalty for inconsistent font");
783 "Penalty for inconsistent spacing");
786 "Use sigmoidal score for certainty");
852 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
LanguageModelState(int col, int row)
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
void GetTopChoiceLowerUpper(LanguageModelFlagsType changed, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper)
int NumInconsistentCase() const
int context_unichar_step_len
bool get_isalpha(UNICHAR_ID unichar_id) const
double language_model_ngram_nonmatch_score
ViterbiStateEntry * best_vse
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
float CertaintyScore(float cert)
void GenerateProblematicPathPainPointsFromColumn(int col, int row, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
LanguageModelNgramInfo * ngram_info
static const LanguageModelFlagsType kConsistentFlag
WERD_CHOICE * ConstructWord(BLOB_CHOICE *b, ViterbiStateEntry *vse, CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, float certainties[], float *dawg_score, STATE *state, BlamerBundle *blamer_bundle, bool *truth_path)
bool PrunablePath(LanguageModelFlagsType top_choice_flags, const LanguageModelDawgInfo *dawg_info)
int language_model_viterbi_list_max_size
float viterbi_state_entries_prunable_max_cost
void GenerateTopChoiceInfo(float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *b, LanguageModelFlagsType *top_choice_flags, LanguageModelFlagsType *changed)
LanguageModelNgramInfo(const char *c, int l, bool p, float np, float nc)
#define BOOL_VAR_H(name, val, comment)
DawgInfoVector * beginning_active_dawgs_
int language_model_ngram_order
GenericVector< bool * > updated_flags_
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int script_id, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse, LanguageModelFlagsType *changed)
bool language_model_ngram_on
const int GetMaxFixedLengthDawgIndex() const
int NumInconsistentSpaces() const
ViterbiStateEntry * parent_vse
LanguageModelDawgInfo(DawgInfoVector *a, DawgInfoVector *c, PermuterType pt)
double language_model_ngram_scale_factor
int prev_word_unichar_step_len_
void GeneratePainPointsFromBestChoice(HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle)
DawgInfoVector * constraints
LanguageModelConsistencyInfo()
static const float kDefaultPainPointPriorityAdjustment
bool acceptable_choice_found_
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, CHUNKS_RECORD *chunks_record, LanguageModelConsistencyInfo *consistency_info)
const UNICHARSET & getUnicharset() const
double language_model_penalty_chartype
float max_penalty_adjust_
const UnicityTable< FontInfo > * fontinfo_table_
T get(int column, int row) const
double language_model_penalty_punc
int language_model_debug_level
UNICHAR_ID unichar_id() const
double language_model_penalty_spacing
bool get_isdigit(UNICHAR_ID unichar_id) const
void GeneratePainPointsFromColumn(int col, const GenericVector< int > &non_empty_rows, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
LanguageModelDawgInfo * dawg_info
bool language_model_ngram_space_delimited_language
static const float kBestChoicePainPointPriorityAdjustment
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio, float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BlamerBundle *blamer_bundle, bool debug_blamer)
static const LanguageModelFlagsType kSmallestRatingFlag
static const float kInitialPainPointPriorityAdjustment
bool ProblematicPath(const ViterbiStateEntry &vse, UNICHAR_ID unichar_id, bool word_end)
bool IsHan(int script_id)
float ComputeConsistencyAdjustedRatingsSum(float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
double language_model_penalty_script
void ExtractRawFeaturesFromPath(const ViterbiStateEntry &vse, float *features)
static const LanguageModelFlagsType kAllChangedFlag
double language_model_penalty_non_freq_dict_word
static const float kMaxAvgNgramCost
ViterbiStateEntry * best_vse
double language_model_penalty_font
float ComputeAdjustedPathCost(float ratings_sum, int length, float dawg_score, const LanguageModelDawgInfo *dawg_info, const LanguageModelNgramInfo *ngram_info, const LanguageModelConsistencyInfo &consistency_info, const AssociateStats &associate_stats, ViterbiStateEntry *parent_vse)
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, AssociateStats *associate_stats)
double language_model_penalty_non_dict_word
#define double_VAR_H(name, val, comment)
bool IsFragment(BLOB_CHOICE *b)
float ComputeAdjustment(int num_problems, float penalty)
int NumInconsistentPunc() const
float ComputeOutlineLength(BLOB_CHOICE *b)
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, const DENORM *denorm, CHUNKS_RECORD *chunks_record, int debug_level, AssociateStats *stats)
AssociateStats associate_stats
void GenerateNgramModelPainPointsFromColumn(int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record)
LanguageModelFlagsType AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool language_model_use_sigmoidal_certainty
bool correct_segmentation_explored_
int num_inconsistent_spaces
WERD_CHOICE * best_choice
bool AcceptableChoiceFound()
DawgInfoVector * empty_dawg_info_vec_
BLOB_CHOICE_LIST_VECTOR * best_char_choices
static int Compare(const void *e1, const void *e2)
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, LanguageModelFlagsType *changed)
static const LanguageModelFlagsType kLowerCaseFlag
bool AcceptablePath(const ViterbiStateEntry &vse)
static const float kCriticalPainPointPriorityAdjustment
int NumInconsistentChartype() const
DawgInfoVector * beginning_constraints_
int language_model_fixed_length_choices_depth
void GetPieceCertainty(BLOB_CHOICE_LIST *blist, float *cert, bool *fragmented)
void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs, int word_index, int word_length, int *skip, int *covered, float *dawg_score, bool *dawg_score_done)
void DeleteState(BLOB_CHOICE_LIST *choices)
static const LanguageModelFlagsType kNgramFlag
static const LanguageModelFlagsType kUpperCaseFlag
static const LanguageModelFlagsType kJustClassifiedFlag
int viterbi_state_entries_length
ViterbiStateEntry(BLOB_CHOICE *pb, ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol, const LanguageModelConsistencyInfo &ci, const AssociateStats &as, LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n)
int viterbi_state_entries_prunable_length
DawgInfoVector * active_dawgs
ViterbiStateEntry_LIST viterbi_state_entries
bool NonAlphaOrDigitMiddle(int col, int row, int dimension, UNICHAR_ID unichar_id)
LanguageModelFlagsType UpdateState(LanguageModelFlagsType changed, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE_LIST *parent_list, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
#define INT_VAR_H(name, val, comment)
void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
int language_model_viterbi_list_max_num_prunable
int language_model_min_compound_length
bool GeneratePainPoint(int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
void UpdateBestChoice(BLOB_CHOICE *b, ViterbiStateEntry *vse, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
static const int kMinFixedLengthDawgLength
bool language_model_ngram_use_only_first_uft8_step
LanguageModelConsistencyInfo consistency_info
double language_model_penalty_increment
unsigned char LanguageModelFlagsType
LanguageModelFlagsType top_choice_flags
static const LanguageModelFlagsType kDawgFlag
BestChoiceBundle(STATE *s, WERD_CHOICE *bc, WERD_CHOICE *rc, BLOB_CHOICE_LIST_VECTOR *bcc)
DawgInfoVector * fixed_length_beginning_active_dawgs_
double language_model_ngram_small_prob
static const float kLooseMaxCharWhRatio
double language_model_penalty_case
void PrintViterbiStateEntry(const char *msg, ViterbiStateEntry *vse, BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record)