Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio, float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BlamerBundle *blamer_bundle, bool debug_blamer)
 
void CleanUp ()
 
void DeleteState (BLOB_CHOICE_LIST *choices)
 
LanguageModelFlagsType UpdateState (LanguageModelFlagsType changed, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE_LIST *parent_list, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateNgramModelPainPointsFromColumn (int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record)
 
void GenerateProblematicPathPainPointsFromColumn (int col, int row, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
 
void GeneratePainPointsFromColumn (int col, const GenericVector< int > &non_empty_rows, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
 
void GeneratePainPointsFromBestChoice (HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle)
 
bool GeneratePainPoint (int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
 
bool AcceptableChoiceFound ()
 
void GetWorstPieceCertainty (int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
 
float ComputeOutlineLength (BLOB_CHOICE *b)
 

Public Attributes

int language_model_debug_level = 0
 
bool language_model_ngram_on = false
 
int language_model_ngram_order = 8
 
int language_model_viterbi_list_max_num_prunable = 10
 
int language_model_viterbi_list_max_size = 500
 
double language_model_ngram_small_prob = 0.000001
 
double language_model_ngram_nonmatch_score = -40.0
 
bool language_model_ngram_use_only_first_uft8_step = false
 
double language_model_ngram_scale_factor = 0.03
 
bool language_model_ngram_space_delimited_language = true
 
int language_model_min_compound_length = 3
 
int language_model_fixed_length_choices_depth = 3
 
double language_model_penalty_non_freq_dict_word = 0.1
 
double language_model_penalty_non_dict_word = 0.15
 
double language_model_penalty_punc = 0.2
 
double language_model_penalty_case = 0.1
 
double language_model_penalty_script = 0.5
 
double language_model_penalty_chartype = 0.3
 
double language_model_penalty_font = 0.00
 
double language_model_penalty_spacing = 0.05
 
double language_model_penalty_increment = 0.01
 
bool language_model_use_sigmoidal_certainty = false
 

Static Public Attributes

static const float kInitialPainPointPriorityAdjustment = 5.0f
 
static const float kDefaultPainPointPriorityAdjustment = 2.0f
 
static const float kBestChoicePainPointPriorityAdjustment = 0.5f
 
static const float kCriticalPainPointPriorityAdjustment = 0.1f
 
static const float kMaxAvgNgramCost = 25.0f
 
static const int kMinFixedLengthDawgLength = 2
 
static const float kLooseMaxCharWhRatio = 2.5f
 
static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kConsistentFlag = 0x8
 
static const LanguageModelFlagsType kDawgFlag = 0x10
 
static const LanguageModelFlagsType kNgramFlag = 0x20
 
static const LanguageModelFlagsType kJustClassifiedFlag = 0x80
 
static const LanguageModelFlagsType kAllChangedFlag = 0xff
 

Protected Member Functions

float CertaintyScore (float cert)
 
bool NonAlphaOrDigitMiddle (int col, int row, int dimension, UNICHAR_ID unichar_id)
 
bool IsFragment (BLOB_CHOICE *b)
 
bool IsHan (int script_id)
 
void GetPieceCertainty (BLOB_CHOICE_LIST *blist, float *cert, bool *fragmented)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
 
float ComputeConsistencyAdjustedRatingsSum (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (float ratings_sum, int length, float dawg_score, const LanguageModelDawgInfo *dawg_info, const LanguageModelNgramInfo *ngram_info, const LanguageModelConsistencyInfo &consistency_info, const AssociateStats &associate_stats, ViterbiStateEntry *parent_vse)
 
bool ProblematicPath (const ViterbiStateEntry &vse, UNICHAR_ID unichar_id, bool word_end)
 
void GetTopChoiceLowerUpper (LanguageModelFlagsType changed, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper)
 
LanguageModelFlagsType AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void PrintViterbiStateEntry (const char *msg, ViterbiStateEntry *vse, BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record)
 
void GenerateTopChoiceInfo (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *b, LanguageModelFlagsType *top_choice_flags, LanguageModelFlagsType *changed)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int script_id, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse, LanguageModelFlagsType *changed)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, LanguageModelFlagsType *changed)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, CHUNKS_RECORD *chunks_record, LanguageModelConsistencyInfo *consistency_info)
 
void UpdateBestChoice (BLOB_CHOICE *b, ViterbiStateEntry *vse, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ExtractRawFeaturesFromPath (const ViterbiStateEntry &vse, float *features)
 
WERD_CHOICE * ConstructWord (BLOB_CHOICE *b, ViterbiStateEntry *vse, CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, float certainties[], float *dawg_score, STATE *state, BlamerBundle *blamer_bundle, bool *truth_path)
 
void UpdateCoveredByFixedLengthDawgs (const DawgInfoVector &active_dawgs, int word_index, int word_length, int *skip, int *covered, float *dawg_score, bool *dawg_score_done)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, AssociateStats *associate_stats)
 
bool PrunablePath (LanguageModelFlagsType top_choice_flags, const LanguageModelDawgInfo *dawg_info)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs * dawg_args_
 
GenericVector< bool * > updated_flags_
 
float rating_cert_scale_
 
const UnicityTable< FontInfo > * fontinfo_table_
 
Dict * dict_
 
bool fixed_pitch_
 
float max_char_wh_ratio_
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_
 
DawgInfoVector * beginning_active_dawgs_
 
DawgInfoVector * beginning_constraints_
 
DawgInfoVector * fixed_length_beginning_active_dawgs_
 
DawgInfoVector * empty_dawg_info_vec_
 
float max_penalty_adjust_
 
bool acceptable_choice_found_
 
bool correct_segmentation_explored_
 

Detailed Description

Definition at line 264 of file language_model.h.

Constructor & Destructor Documentation

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 43 of file language_model.cpp.

45  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
46  dict->getImage()->getCCUtil()->params()),
48  "Turn on/off the use of character ngram model",
49  dict->getImage()->getCCUtil()->params()),
51  "Maximum order of the character ngram model",
52  dict->getImage()->getCCUtil()->params()),
54  "Maximum number of prunable (those for which"
55  " PrunablePath() is true) entries in each viterbi list"
56  " recorded in BLOB_CHOICEs",
57  dict->getImage()->getCCUtil()->params()),
59  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
60  dict->getImage()->getCCUtil()->params()),
62  "To avoid overly small denominators use this as the "
63  "floor of the probability returned by the ngram model.",
64  dict->getImage()->getCCUtil()->params()),
66  "Average classifier score of a non-matching unichar.",
67  dict->getImage()->getCCUtil()->params()),
69  "Use only the first UTF8 step of the given string"
70  " when computing log probabilities.",
71  dict->getImage()->getCCUtil()->params()),
73  "Strength of the character ngram model relative to the"
74  " character classifier ",
75  dict->getImage()->getCCUtil()->params()),
77  "Words are delimited by space",
78  dict->getImage()->getCCUtil()->params()),
80  "Minimum length of compound words",
81  dict->getImage()->getCCUtil()->params()),
83  "Depth of blob choice lists to explore"
84  " when fixed length dawgs are on",
85  dict->getImage()->getCCUtil()->params()),
87  "Penalty for words not in the frequent word dictionary",
88  dict->getImage()->getCCUtil()->params()),
90  "Penalty for non-dictionary words",
91  dict->getImage()->getCCUtil()->params()),
93  "Penalty for inconsistent punctuation",
94  dict->getImage()->getCCUtil()->params()),
96  "Penalty for inconsistent case",
97  dict->getImage()->getCCUtil()->params()),
99  "Penalty for inconsistent script",
100  dict->getImage()->getCCUtil()->params()),
102  "Penalty for inconsistent character type",
103  dict->getImage()->getCCUtil()->params()),
104  // TODO(daria, rays): enable font consistency checking
105  // after improving font analysis.
107  "Penalty for inconsistent font",
108  dict->getImage()->getCCUtil()->params()),
110  "Penalty for inconsistent spacing",
111  dict->getImage()->getCCUtil()->params()),
113  "Penalty increment",
114  dict->getImage()->getCCUtil()->params()),
116  "Use sigmoidal score for certainty",
117  dict->getImage()->getCCUtil()->params()),
118  fontinfo_table_(fontinfo_table), dict_(dict),
119  fixed_pitch_(false), max_char_wh_ratio_(0.0),
120  acceptable_choice_found_(false) {
121  ASSERT_HOST(dict_ != NULL);
122  dawg_args_ = new DawgArgs(NULL, NULL, new DawgInfoVector(),
123  new DawgInfoVector(),
124  0.0, NO_PERM, kAnyWordLength, -1);
125  beginning_active_dawgs_ = new DawgInfoVector();
126  beginning_constraints_ = new DawgInfoVector();
127  fixed_length_beginning_active_dawgs_ = new DawgInfoVector();
128  empty_dawg_info_vec_ = new DawgInfoVector();
129 }
double language_model_ngram_nonmatch_score
DawgInfoVector * beginning_active_dawgs_
#define NULL
Definition: host.h:144
const UnicityTable< FontInfo > * fontinfo_table_
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:272
bool language_model_ngram_space_delimited_language
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:287
double language_model_penalty_non_freq_dict_word
double language_model_penalty_non_dict_word
DawgInfoVector * empty_dawg_info_vec_
DawgInfoVector * beginning_constraints_
int language_model_viterbi_list_max_num_prunable
bool language_model_ngram_use_only_first_uft8_step
#define ASSERT_HOST(x)
Definition: errcode.h:84
DawgInfoVector * fixed_length_beginning_active_dawgs_
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:275
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:281
tesseract::LanguageModel::~LanguageModel ( )

Definition at line 131 of file language_model.cpp.

131  {
133  delete beginning_constraints_;
135  delete empty_dawg_info_vec_;
138  delete dawg_args_;
139 }
DawgInfoVector * beginning_active_dawgs_
DawgInfoVector * updated_constraints
Definition: dict.h:80
DawgInfoVector * empty_dawg_info_vec_
DawgInfoVector * beginning_constraints_
DawgInfoVector * updated_active_dawgs
Definition: dict.h:79
DawgInfoVector * fixed_length_beginning_active_dawgs_

Member Function Documentation

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 407 of file language_model.h.

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry & vse)
inlineprotected

Definition at line 731 of file language_model.h.

731  {
732  return (vse.dawg_info != NULL || vse.Consistent() ||
733  (vse.ngram_info != NULL && !vse.ngram_info->pruned));
734  }
#define NULL
Definition: host.h:144
LanguageModelFlagsType tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE b,
BLOB_CHOICE parent_b,
ViterbiStateEntry parent_vse,
HEAP pain_points,
BestPathByColumn best_path_by_column[],
CHUNKS_RECORD chunks_record,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)
protected

Definition at line 450 of file language_model.cpp.

462  {
463  ViterbiStateEntry_IT vit;
464  if (language_model_debug_level > 0) {
465  tprintf("\nAddViterbiStateEntry for unichar %s rating=%.4f"
466  " certainty=%.4f top_choice_flags=0x%x parent_vse=%p\n",
468  b->rating(), b->certainty(), top_choice_flags, parent_vse);
470  tprintf("Existing viterbi list:\n");
471  vit.set_to_list(&(reinterpret_cast<LanguageModelState *>(
472  b->language_model_state())->viterbi_state_entries));
473  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
474  PrintViterbiStateEntry("", vit.data(), b, chunks_record);
475  }
476  }
477  }
478  // Check whether the list is full.
479  if (b->language_model_state() != NULL &&
480  (reinterpret_cast<LanguageModelState *>(
481  b->language_model_state())->viterbi_state_entries_length >=
483  if (language_model_debug_level > 1) {
484  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
485  }
486  return 0x0;
487  }
488 
489  LanguageModelFlagsType changed = 0x0;
490  float optimistic_cost = 0.0f;
491  if (!language_model_ngram_on) optimistic_cost += b->rating();
492  if (parent_vse != NULL) optimistic_cost += parent_vse->cost;
493  // Discard this entry if it will not beat best choice.
494  if (optimistic_cost >= best_choice_bundle->best_choice->rating()) {
495  if (language_model_debug_level > 1) {
496  tprintf("Discarded ViterbiEntry with high cost %.4f"
497  " best_choice->rating()=%.4f\n", optimistic_cost,
498  best_choice_bundle->best_choice->rating());
499  }
500  return 0x0;
501  }
502 
503  // Check consistency of the path and set the relevant consistency_info.
504  LanguageModelConsistencyInfo consistency_info;
505  FillConsistencyInfo(curr_col, word_end, b, parent_vse, parent_b,
506  chunks_record, &consistency_info);
507 
508  // Invoke Dawg language model component.
509  LanguageModelDawgInfo *dawg_info =
510  GenerateDawgInfo(word_end, consistency_info.script_id,
511  curr_col, curr_row, *b, parent_vse, &changed);
512 
513  // Invoke TopChoice language model component
514  float ratings_sum = b->rating();
515  if (parent_vse != NULL) ratings_sum += parent_vse->ratings_sum;
516  GenerateTopChoiceInfo(ratings_sum, dawg_info, consistency_info,
517  parent_vse, b, &top_choice_flags, &changed);
518 
519  // Invoke Ngram language model component.
520  LanguageModelNgramInfo *ngram_info = NULL;
522  ngram_info = GenerateNgramInfo(
524  denom, curr_col, curr_row, parent_vse, parent_b, &changed);
525  ASSERT_HOST(ngram_info != NULL);
526  }
527 
528  // Prune non-top choice paths with inconsistent scripts.
529  if (consistency_info.inconsistent_script) {
530  if (!(top_choice_flags & kSmallestRatingFlag)) changed = 0x0;
531  if (ngram_info != NULL) ngram_info->pruned = true;
532  }
533 
534  // If language model components did not like this unichar - return
535  if (!changed) {
536  if (language_model_debug_level > 0) {
537  tprintf("Language model components did not like this entry\n");
538  }
539  delete dawg_info;
540  delete ngram_info;
541  return 0x0;
542  }
543 
544  // Compute cost of associating the blobs that represent the current unichar.
545  AssociateStats associate_stats;
546  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
547  parent_vse, chunks_record, &associate_stats);
548  if (parent_vse != NULL) {
549  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
550  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
551  }
552 
553  // Compute the aggregate cost (adjusted ratings sum).
554  float cost = ComputeAdjustedPathCost(
555  ratings_sum,
556  (parent_vse == NULL) ? 1 : (parent_vse->length+1),
557  (dawg_info == NULL) ? 0.0f : 1.0f,
558  dawg_info, ngram_info, consistency_info, associate_stats, parent_vse);
559 
560  if (b->language_model_state() == NULL) {
561  b->set_language_model_state(new LanguageModelState(curr_col, curr_row));
562  }
563  LanguageModelState *lms =
564  reinterpret_cast<LanguageModelState *>(b->language_model_state());
565 
566  // Discard this entry if it represents a prunable path and
567  // language_model_viterbi_list_max_num_prunable such entries with a lower
568  // cost have already been recorded.
569  if (PrunablePath(top_choice_flags, dawg_info) &&
570  (lms->viterbi_state_entries_prunable_length >=
572  cost >= lms->viterbi_state_entries_prunable_max_cost) {
573  if (language_model_debug_level > 1) {
574  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
575  cost, lms->viterbi_state_entries_prunable_max_cost);
576  }
577  delete dawg_info;
578  delete ngram_info;
579  return 0x0;
580  }
581 
582  // Create the new ViterbiStateEntry and add it to lms->viterbi_state_entries
583  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
584  parent_b, parent_vse, b, cost, ComputeOutlineLength(b), consistency_info,
585  associate_stats, top_choice_flags, dawg_info, ngram_info);
586  updated_flags_.push_back(&(new_vse->updated));
587  lms->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
588  false, new_vse);
589  lms->viterbi_state_entries_length++;
590  if (PrunablePath(top_choice_flags, dawg_info)) {
591  lms->viterbi_state_entries_prunable_length++;
592  }
593 
594  // Update lms->viterbi_state_entries_prunable_max_cost and clear
595  // top_choice_flags of entries with ratings_sum higher than new_vse->ratings_sum.
596  if ((lms->viterbi_state_entries_prunable_length >=
597  language_model_viterbi_list_max_num_prunable) || top_choice_flags) {
598  ASSERT_HOST(!lms->viterbi_state_entries.empty());
599  int prunable_counter = language_model_viterbi_list_max_num_prunable;
600  vit.set_to_list(&(lms->viterbi_state_entries));
601  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
602  ViterbiStateEntry *curr_vse = vit.data();
603  // Clear the appropriate top choice flags of the entries in the
604  // list that have ratings_sum higher than new_entry->ratings_sum
605  // (since they will not be top choices any more).
606  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
608  curr_vse->ratings_sum, curr_vse->dawg_info,
609  curr_vse->consistency_info) >
611  new_vse->ratings_sum, new_vse->dawg_info,
612  new_vse->consistency_info)) {
613  curr_vse->top_choice_flags &= ~(top_choice_flags);
614  }
615  if (prunable_counter > 0 &&
616  PrunablePath(curr_vse->top_choice_flags, curr_vse->dawg_info)) {
617  --prunable_counter;
618  }
619  // Update lms->viterbi_state_entries_prunable_max_cost.
620  if (prunable_counter == 0) {
621  lms->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
622  if (language_model_debug_level > 1) {
623  tprintf("Set viterbi_state_entries_prunable_max_cost to %.4f\n",
624  lms->viterbi_state_entries_prunable_max_cost);
625  }
626  prunable_counter = -1; // stop counting
627  }
628  }
629  }
630 
631  // Print the newly created ViterbiStateEntry.
632  if (language_model_debug_level > 2) {
633  PrintViterbiStateEntry("New", new_vse, b, chunks_record);
634  if (language_model_debug_level > 3) {
635  tprintf("Updated viterbi list (max cost %g):\n",
636  lms->viterbi_state_entries_prunable_max_cost);
637  vit.set_to_list(&(lms->viterbi_state_entries));
638  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
639  PrintViterbiStateEntry("", vit.data(), b, chunks_record);
640  }
641  }
642  }
643 
644  // Update best choice if needed.
645  if (word_end) {
646  UpdateBestChoice(b, new_vse, pain_points, chunks_record,
647  best_choice_bundle, blamer_bundle);
648  }
649 
650  // Update stats in best_path_by_column.
651  if (new_vse->Consistent() || new_vse->dawg_info != NULL ||
652  (new_vse->ngram_info != NULL && !new_vse->ngram_info->pruned)) {
653  float avg_cost = new_vse->cost / static_cast<float>(curr_row+1);
654  for (int c = curr_col; c <= curr_row; ++c) {
655  if (avg_cost < (*best_path_by_column)[c].avg_cost) {
656  (*best_path_by_column)[c].avg_cost = avg_cost;
657  (*best_path_by_column)[c].best_vse = new_vse;
658  (*best_path_by_column)[c].best_b = b;
659  if (language_model_debug_level > 0) {
660  tprintf("Set best_path_by_column[%d]=(%g %p)\n",
661  c, avg_cost, new_vse);
662  }
663  }
664  }
665  }
666  return changed;
667 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
bool PrunablePath(LanguageModelFlagsType top_choice_flags, const LanguageModelDawgInfo *dawg_info)
void GenerateTopChoiceInfo(float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *b, LanguageModelFlagsType *top_choice_flags, LanguageModelFlagsType *changed)
GenericVector< bool * > updated_flags_
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int script_id, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse, LanguageModelFlagsType *changed)
#define NULL
Definition: host.h:144
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, CHUNKS_RECORD *chunks_record, LanguageModelConsistencyInfo *consistency_info)
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
int push_back(T object)
static const LanguageModelFlagsType kSmallestRatingFlag
float ComputeConsistencyAdjustedRatingsSum(float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
float ComputeAdjustedPathCost(float ratings_sum, int length, float dawg_score, const LanguageModelDawgInfo *dawg_info, const LanguageModelNgramInfo *ngram_info, const LanguageModelConsistencyInfo &consistency_info, const AssociateStats &associate_stats, ViterbiStateEntry *parent_vse)
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, AssociateStats *associate_stats)
float ComputeOutlineLength(BLOB_CHOICE *b)
float certainty() const
Definition: ratngs.h:65
static int Compare(const void *e1, const void *e2)
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, LanguageModelFlagsType *changed)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void * language_model_state()
Definition: ratngs.h:77
int language_model_viterbi_list_max_num_prunable
void UpdateBestChoice(BLOB_CHOICE *b, ViterbiStateEntry *vse, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
#define ASSERT_HOST(x)
Definition: errcode.h:84
unsigned char LanguageModelFlagsType
void PrintViterbiStateEntry(const char *msg, ViterbiStateEntry *vse, BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record)
void set_language_model_state(void *language_model_state)
Definition: ratngs.h:114
float rating() const
Definition: ratngs.h:62
float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 440 of file language_model.h.

440  {
442  // cert is assumed to be between 0 and -dict_->certainty_scale.
443  // If you enable language_model_use_sigmoidal_certainty, you
444  // need to adjust language_model_ngram_nonmatch_score as well.
445  cert = -cert / dict_->certainty_scale;
446  return 1.0f / (1.0f + exp(10.0f * cert));
447  } else {
448  return (-1.0f / cert);
449  }
450  }
double certainty_scale
Definition: dict.h:845
#define f(xc, yc)
Definition: imgscale.cpp:39
void tesseract::LanguageModel::CleanUp ( )

Definition at line 273 of file language_model.cpp.

273  {
274  for (int i = 0; i < updated_flags_.size(); ++i) {
275  *(updated_flags_[i]) = false;
276  }
278 }
virtual void clear()
GenericVector< bool * > updated_flags_
int size() const
Definition: genericvector.h:59
float tesseract::LanguageModel::ComputeAdjustedPathCost ( float  ratings_sum,
int  length,
float  dawg_score,
const LanguageModelDawgInfo dawg_info,
const LanguageModelNgramInfo ngram_info,
const LanguageModelConsistencyInfo consistency_info,
const AssociateStats associate_stats,
ViterbiStateEntry parent_vse 
)
protected

Definition at line 1152 of file language_model.cpp.

1158  {
1159  float adjustment = 1.0f;
1160  if (dawg_info == NULL || dawg_info->permuter != FREQ_DAWG_PERM) {
1162  }
1163  if (dawg_score == 0.0f) {
1165  if (length > language_model_min_compound_length) {
1166  adjustment += ((length - language_model_min_compound_length) *
1168  }
1169  } else if (dawg_score < 1.0f) {
1170  adjustment += (1.0f - dawg_score) * language_model_penalty_non_dict_word;
1171  }
1172  if (associate_stats.shape_cost > 0) {
1173  adjustment += associate_stats.shape_cost / static_cast<float>(length);
1174  }
1176  ASSERT_HOST(ngram_info != NULL);
1177  return ngram_info->ngram_cost * adjustment;
1178  } else {
1179  adjustment += ComputeConsistencyAdjustment(dawg_info, consistency_info);
1180  return ratings_sum * adjustment;
1181  }
1182 }
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
#define NULL
Definition: host.h:144
#define f(xc, yc)
Definition: imgscale.cpp:39
double language_model_penalty_non_freq_dict_word
double language_model_penalty_non_dict_word
#define ASSERT_HOST(x)
Definition: errcode.h:84
float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 485 of file language_model.h.

485  {
486  if (num_problems == 0) return 0.0f;
487  if (num_problems == 1) return penalty;
488  return (penalty + (language_model_penalty_increment *
489  static_cast<float>(num_problems-1)));
490  }
void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry parent_vse,
CHUNKS_RECORD chunks_record,
AssociateStats associate_stats 
)
inlineprotected

Definition at line 699 of file language_model.h.

703  {
705  col, row,
706  (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
707  (parent_vse != NULL) ? parent_vse->length : 0,
708  fixed_pitch_, max_char_wh_ratio,
709  chunks_record->word_res != NULL ? &chunks_record->word_res->denorm : NULL,
710  chunks_record, language_model_debug_level, associate_stats);
711  }
WERD_RES * word_res
Definition: associate.h:54
#define NULL
Definition: host.h:144
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, const DENORM *denorm, CHUNKS_RECORD *chunks_record, int debug_level, AssociateStats *stats)
Definition: associate.cpp:37
DENORM denorm
Definition: pageres.h:346
float tesseract::LanguageModel::ComputeConsistencyAdjustedRatingsSum ( float  ratings_sum,
const LanguageModelDawgInfo dawg_info,
const LanguageModelConsistencyInfo consistency_info 
)
inlineprotected

Definition at line 518 of file language_model.h.

521  {
522  return (ratings_sum * (1.0f + ComputeConsistencyAdjustment(
523  dawg_info, consistency_info)));
524  }
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
#define f(xc, yc)
Definition: imgscale.cpp:39
float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo dawg_info,
const LanguageModelConsistencyInfo consistency_info 
)
inlineprotected

Definition at line 496 of file language_model.h.

498  {
499  if (dawg_info != NULL) {
500  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
502  }
503  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
505  ComputeAdjustment(consistency_info.NumInconsistentCase(),
507  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
509  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
511  (consistency_info.inconsistent_script ?
513  (consistency_info.inconsistent_font ?
515  }
#define NULL
Definition: host.h:144
float ComputeAdjustment(int num_problems, float penalty)
float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 962 of file language_model.cpp.

962  {
963  if (curr_list->empty()) return 1.0f;
964  float denom = 0.0f;
965  int len = 0;
966  BLOB_CHOICE_IT c_it(curr_list);
967  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
968  ASSERT_HOST(c_it.data() != NULL);
969  ++len;
970  denom += CertaintyScore(c_it.data()->certainty());
971  }
972  assert(len != 0);
973  // The ideal situation would be to have the classifier scores for
974  // classifying each position as each of the characters in the unicharset.
975  // Since we can not do this because of speed, we add a very crude estimate
976  // of what these scores for the "missing" classifications would sum up to.
977  denom += (dict_->getUnicharset().size() - len) *
979 
980  return denom;
981 }
double language_model_ngram_nonmatch_score
float CertaintyScore(float cert)
int size() const
Definition: unicharset.h:264
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
#define f(xc, yc)
Definition: imgscale.cpp:39
#define ASSERT_HOST(x)
Definition: errcode.h:84
float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 905 of file language_model.cpp.

911  {
912  const char *context_ptr = context;
913  char *modified_context = NULL;
914  char *modified_context_end = NULL;
915  const char *unichar_ptr = unichar;
916  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
917  float prob = 0.0f;
918  int step = 0;
919  while (unichar_ptr < unichar_end &&
920  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
921  if (language_model_debug_level > 1) {
922  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
923  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
924  }
925  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
926  ++(*unichar_step_len);
928  unichar_ptr += step;
929  // If there are multiple UTF8 characters present in unichar, context is
930  // updated to include the previously examined characters from str,
931  // unless use_only_first_uft8_step is true.
932  if (unichar_ptr < unichar_end) {
933  if (modified_context == NULL) {
934  int context_len = strlen(context);
935  modified_context =
936  new char[context_len + strlen(unichar_ptr) + step + 1];
937  strncpy(modified_context, context, context_len);
938  modified_context_end = modified_context + context_len;
939  context_ptr = modified_context;
940  }
941  strncpy(modified_context_end, unichar_ptr - step, step);
942  modified_context_end += step;
943  *modified_context_end = '\0';
944  }
945  }
946  prob /= static_cast<float>(*unichar_step_len); // normalize
947  if (prob < language_model_ngram_small_prob) {
948  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
949  *found_small_prob = true;
950  }
951  *ngram_prob = -1.0*log(prob);
952  float cost = -1.0*log(CertaintyScore(certainty)/denom) +
953  *ngram_prob * language_model_ngram_scale_factor;
954  if (language_model_debug_level > 1) {
955  tprintf("-log [ p(%s) * p(%s | %s) ] = -log(%g*%g) = %g\n", unichar,
956  unichar, context_ptr, CertaintyScore(certainty)/denom, prob, cost);
957  }
958  if (modified_context != NULL) delete[] modified_context;
959  return cost;
960 }
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:576
float CertaintyScore(float cert)
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:131
#define NULL
Definition: host.h:144
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool language_model_ngram_use_only_first_uft8_step
float tesseract::LanguageModel::ComputeOutlineLength ( BLOB_CHOICE b)
inline

Definition at line 434 of file language_model.h.

434  {
435  return rating_cert_scale_ * b->rating() / b->certainty();
436  }
float certainty() const
Definition: ratngs.h:65
float rating() const
Definition: ratngs.h:62
WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( BLOB_CHOICE b,
ViterbiStateEntry vse,
CHUNKS_RECORD chunks_record,
BLOB_CHOICE_LIST_VECTOR best_char_choices,
float  certainties[],
float *  dawg_score,
STATE state,
BlamerBundle blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1375 of file language_model.cpp.

1384  {
1385  uinT64 state_uint = 0x0;
1386  if (truth_path != NULL) {
1387  *truth_path =
1388  (blamer_bundle != NULL &&
1389  !blamer_bundle->correct_segmentation_cols.empty() &&
1390  vse->length == blamer_bundle->correct_segmentation_cols.length());
1391  }
1392  const uinT64 kHighestBitOn = 0x8000000000000000LL;
1393  BLOB_CHOICE *curr_b = b;
1394  LanguageModelState *curr_lms =
1395  reinterpret_cast<LanguageModelState *>(b->language_model_state());
1396  ViterbiStateEntry *curr_vse = vse;
1397 
1398  int i;
1399  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1400  bool dawg_score_done = true;
1401  if (dawg_score != NULL) {
1402  *dawg_score = 0.0f;
1403  // For space-delimited languages the presence of dawg_info in the last
 1404  // ViterbiStateEntry on the path means that the whole path represents
1405  // a valid dictionary word.
1406  if (dict_->GetMaxFixedLengthDawgIndex() < 0) {
1407  if (vse->dawg_info != NULL) *dawg_score = 1.0f;
1408  } else if (vse->length == 1) {
1409  *dawg_score = 1.0f; // each one-letter word is legal
1410  dawg_score_done = true; // in non-space delimited languages
1411  } else {
1412  dawg_score_done = false; // do more work to compute dawg_score
1413  }
1414  }
1415  // For non-space delimited languages we compute the fraction of letters
1416  // "covered" by fixed length dawgs (i.e. words of length > 1 on the path).
1417  int covered_by_fixed_length_dawgs = 0;
1418  // The number of unichars remaining that should be skipped because
1419  // they are covered by the previous word from fixed length dawgs.
1420  int fixed_length_num_unichars_to_skip = 0;
1421 
1422  // Re-compute the variance of the width-to-height ratios (since we now
1423  // can compute the mean over the whole word).
1424  float full_wh_ratio_mean = 0.0f;
1425  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1426  vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1427  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1428  static_cast<float>(vse->length));
1429  vse->associate_stats.full_wh_ratio_var = 0.0f;
1430  }
1431 
1432  // Construct a WERD_CHOICE by tracing parent pointers.
1433  WERD_CHOICE *word = new WERD_CHOICE(chunks_record->word_res->uch_set,
1434  vse->length);
1435  word->set_length(vse->length);
1436  for (i = (vse->length-1); i >= 0; --i) {
1437  if (blamer_bundle != NULL && truth_path != NULL && *truth_path) {
1438  if ((blamer_bundle->correct_segmentation_cols[i] !=
1439  curr_lms->contained_in_col) ||
1440  (blamer_bundle->correct_segmentation_rows[i] !=
1441  curr_lms->contained_in_row)) {
1442  *truth_path = false;
1443  }
1444  }
1445  word->set_unichar_id(curr_b->unichar_id(), i);
1446  word->set_fragment_length(1, i);
1447  if (certainties != NULL) certainties[i] = curr_b->certainty();
1448  if (best_char_choices != NULL) {
1449  best_char_choices->set(chunks_record->ratings->get(
1450  curr_lms->contained_in_col, curr_lms->contained_in_row), i);
1451  }
1452  if (state != NULL) {
1453  // Record row minus col zeroes in the reverse state to mark the number
1454  // of joins done by using a blob from this cell in the ratings matrix.
1455  state_uint >>= (curr_lms->contained_in_row - curr_lms->contained_in_col);
1456  // Record a one in the reverse state to indicate the split before
1457  // the blob from the next cell in the ratings matrix (unless we are
1458  // at the first cell, in which case there is no next blob).
1459  if (i != 0) {
1460  state_uint >>= 1;
1461  state_uint |= kHighestBitOn;
1462  }
1463  }
1464  // For non-space delimited languages: find word endings recorded while
1465  // trying to separate the current path into words (for words found in
 1466  // fixed length dawgs).
1467  if (!dawg_score_done && curr_vse->dawg_info != NULL) {
1468  UpdateCoveredByFixedLengthDawgs(*(curr_vse->dawg_info->active_dawgs),
1469  i, vse->length,
1470  &fixed_length_num_unichars_to_skip,
1471  &covered_by_fixed_length_dawgs,
1472  dawg_score, &dawg_score_done);
1473  }
 1474  // Update the width-to-height ratio variance. Useful for non-space delimited
1475  // languages to ensure that the blobs are of uniform width.
1476  // Skip leading and trailing punctuation when computing the variance.
1477  if ((full_wh_ratio_mean != 0.0f &&
1478  ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
1479  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1480  vse->associate_stats.full_wh_ratio_var +=
1481  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1482  if (language_model_debug_level > 2) {
1483  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1484  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1485  }
1486  }
1487 
1488  // Mark the word as compound if compound permuter was set for any of
1489  // the unichars on the path (usually this will happen for unichars
1490  // that are compounding operators, like "-" and "/").
1491  if (!compound && curr_vse->dawg_info &&
1492  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1493 
1494  // Update curr_* pointers.
1495  if (curr_vse->parent_b == NULL) break;
1496  curr_b = curr_vse->parent_b;
1497  curr_lms =
1498  reinterpret_cast<LanguageModelState *>(curr_b->language_model_state());
1499  curr_vse = curr_vse->parent_vse;
1500  }
1501  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids
1502  // Re-adjust shape cost to include the updated width-to-height variance.
1503  if (full_wh_ratio_mean != 0.0f) {
1504  vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1505  }
1506 
1507  if (state != NULL) {
1508  state_uint >>= (64 - (chunks_record->ratings->dimension()-1));
1509  state->part2 = state_uint;
1510  state_uint >>= 32;
1511  state->part1 = state_uint;
1512  }
1513  word->set_rating(vse->ratings_sum);
1514  word->set_certainty(vse->min_certainty);
1515  if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
1516  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1517  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1518  word->set_permuter(NGRAM_PERM);
1519  } else if (vse->top_choice_flags) {
1520  word->set_permuter(TOP_CHOICE_PERM);
1521  } else {
1522  word->set_permuter(NO_PERM);
1523  }
1524  return word;
1525 }
WERD_RES * word_res
Definition: associate.h:54
void set_rating(float new_val)
Definition: ratngs.h:255
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:118
GenericVector< int > correct_segmentation_cols
Definition: pageres.h:190
const int GetMaxFixedLengthDawgIndex() const
Definition: dict.h:616
void set(T t, int index)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T get(int column, int row) const
Definition: matrix.h:117
uinT32 part1
Definition: states.h:41
#define f(xc, yc)
Definition: imgscale.cpp:39
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
void set_length(int len)
Definition: ratngs.h:270
void set_fragment_length(char flen, int index)
Definition: ratngs.h:251
float certainty() const
Definition: ratngs.h:65
const UNICHARSET * uch_set
Definition: pageres.h:348
int dimension() const
Definition: matrix.h:190
bool empty() const
Definition: genericvector.h:68
void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs, int word_index, int word_length, int *skip, int *covered, float *dawg_score, bool *dawg_score_done)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void set_certainty(float new_val)
Definition: ratngs.h:258
unsigned long long int uinT64
Definition: host.h:109
void * language_model_state()
Definition: ratngs.h:77
uinT32 part2
Definition: states.h:42
int length() const
Definition: genericvector.h:63
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:247
MATRIX * ratings
Definition: associate.h:52
#define ASSERT_HOST(x)
Definition: errcode.h:84
GenericVector< int > correct_segmentation_rows
Definition: pageres.h:191
void set_permuter(uinT8 perm)
Definition: ratngs.h:261
void tesseract::LanguageModel::DeleteState ( BLOB_CHOICE_LIST *  choices)

Definition at line 280 of file language_model.cpp.

280  {
281  BLOB_CHOICE_IT b_it(choices);
282  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
283  if (b_it.data()->language_model_state() != NULL) {
284  LanguageModelState *state = reinterpret_cast<LanguageModelState *>(
285  b_it.data()->language_model_state());
286  delete state;
287  b_it.data()->set_language_model_state(NULL);
288  }
289  }
290 }
#define NULL
Definition: host.h:144
void tesseract::LanguageModel::ExtractRawFeaturesFromPath ( const ViterbiStateEntry vse,
float *  features 
)
protected

Definition at line 1330 of file language_model.cpp.

1331  {
1332  for (int f = 0; f < PTRAIN_NUM_RAW_FEATURE_TYPES; ++f) features[f] = 0.0;
1333  // Dictionary-related features.
1334  if (vse.dawg_info != NULL) {
1335  features[PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE] = vse.dawg_info->permuter;
1336 
1337  // Mark as unambiguous if unambig_dawg is found among active dawgs.
1338  for (int d = 0; d < vse.dawg_info->active_dawgs->size(); ++d) {
1339  if (dict_->GetDawg(vse.dawg_info->active_dawgs->get(d).dawg_index) ==
1340  dict_->GetUnambigDawg()) {
1341  features[PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH] = 1.0;
1342  break;
1343  }
1344  }
1345  }
1346  if (vse.associate_stats.shape_cost > 0) {
1347  features[PTRAIN_RAW_FEATURE_SHAPE_COST] = vse.associate_stats.shape_cost;
1348  }
1350  ASSERT_HOST(vse.ngram_info != NULL);
1351  features[PTRAIN_RAW_FEATURE_NGRAM_PROB] = vse.ngram_info->ngram_prob;
1352  }
1353  // Consistency-related features.
1355  vse.consistency_info.NumInconsistentPunc();
1357  vse.consistency_info.NumInconsistentCase();
1359  vse.consistency_info.NumInconsistentChartype();
1361  vse.consistency_info.NumInconsistentSpaces();
1363  vse.consistency_info.inconsistent_script;
1365  vse.consistency_info.inconsistent_font;
1366  // Classifier-related features.
1367  features[PTRAIN_RAW_FEATURE_WORST_CERT] = vse.min_certainty;
1368  features[PTRAIN_RAW_FEATURE_RATING] = vse.ratings_sum;
1369  features[PTRAIN_RAW_FEATURE_ADAPTED] = vse.adapted;
1370  // Normalization-related features.
1371  features[PTRAIN_RAW_FEATURE_NUM_UNICHARS] = vse.length;
1372  features[PTRAIN_RAW_FEATURE_OUTLINE_LEN] = vse.outline_length;
1373 }
#define NULL
Definition: host.h:144
#define f(xc, yc)
Definition: imgscale.cpp:39
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:605
#define ASSERT_HOST(x)
Definition: errcode.h:84
const Dawg * GetUnambigDawg() const
Return the points to the unambiguous words dawg.
Definition: dict.h:609
void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE b,
ViterbiStateEntry parent_vse,
BLOB_CHOICE parent_b,
CHUNKS_RECORD chunks_record,
LanguageModelConsistencyInfo consistency_info 
)
protected

Definition at line 983 of file language_model.cpp.

990  {
991  const UNICHARSET &unicharset = dict_->getUnicharset();
992  UNICHAR_ID unichar_id = b->unichar_id();
993  if (parent_vse != NULL) *consistency_info = parent_vse->consistency_info;
994 
995  // Check punctuation validity.
996  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
997  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
998  if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
999  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1000  unicharset.get_isdigit(parent_b->unichar_id()))) {
1001  // reset punc_ref for compound words
1002  consistency_info->punc_ref = NO_EDGE;
1003  } else {
1004  UNICHAR_ID pattern_unichar_id =
1005  (unicharset.get_isalpha(unichar_id) ||
1006  unicharset.get_isdigit(unichar_id)) ?
1007  Dawg::kPatternUnicharID : unichar_id;
1008  if (consistency_info->punc_ref == NO_EDGE ||
1009  pattern_unichar_id != Dawg::kPatternUnicharID ||
1010  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1013  consistency_info->punc_ref);
1014  consistency_info->punc_ref =
1015  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1016  node, pattern_unichar_id, word_end) : NO_EDGE;
1017  if (consistency_info->punc_ref == NO_EDGE) {
1018  consistency_info->invalid_punc = true;
1019  }
1020  }
1021  }
1022  }
1023 
1024  // Update case related counters.
1025  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
1026  // Reset counters if we are dealing with a compound word.
1027  consistency_info->num_lower = 0;
1028  consistency_info->num_non_first_upper = 0;
1029  }
1030  else if (unicharset.get_islower(unichar_id)) {
1031  consistency_info->num_lower++;
1032  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
1033  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1034  consistency_info->num_lower > 0 ||
1035  consistency_info->num_non_first_upper > 0) {
1036  consistency_info->num_non_first_upper++;
1037  }
1038  }
1039 
1040  // Initialize consistency_info->script_id (use script of unichar_id
1041  // if it is not Common, use script id recorded by the parent otherwise).
1042  // Set inconsistent_script to true if the script of the current unichar
1043  // is not consistent with that of the parent.
1044  consistency_info->script_id = unicharset.get_script(unichar_id);
1045  // Hiragana and Katakana can mix with Han.
1047  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1048  consistency_info->script_id == unicharset.hiragana_sid()) ||
1049  (unicharset.katakana_sid() != unicharset.null_sid() &&
1050  consistency_info->script_id == unicharset.katakana_sid())) {
1051  consistency_info->script_id = dict_->getUnicharset().han_sid();
1052  }
1053  }
1054 
1055  if (parent_vse != NULL &&
1056  (parent_vse->consistency_info.script_id !=
1057  dict_->getUnicharset().common_sid())) {
1058  int parent_script_id = parent_vse->consistency_info.script_id;
1059  // If script_id is Common, use script id of the parent instead.
1060  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1061  consistency_info->script_id = parent_script_id;
1062  }
1063  if (consistency_info->script_id != parent_script_id) {
1064  consistency_info->inconsistent_script = true;
1065  }
1066  }
1067 
1068  // Update chartype related counters.
1069  if (unicharset.get_isalpha(unichar_id)) {
1070  consistency_info->num_alphas++;
1071  } else if (unicharset.get_isdigit(unichar_id)) {
1072  consistency_info->num_digits++;
1073  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1074  consistency_info->num_other++;
1075  }
1076 
1077  // Check font and spacing consistency.
1078  if (parent_b != NULL) {
1079  int fontinfo_id = -1;
1080  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1081  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1082  fontinfo_id = b->fontinfo_id();
1083  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1084  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1085  fontinfo_id = b->fontinfo_id2();
1086  }
1087  if(language_model_debug_level > 1) {
1088  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1089  (parent_b->fontinfo_id() >= 0) ?
1090  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1091  (parent_b->fontinfo_id2() >= 0) ?
1092  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1093  (b->fontinfo_id() >= 0) ?
1094  fontinfo_table_->get(b->fontinfo_id()).name : "",
1095  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1096  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1097  fontinfo_id);
1098  }
1099  bool expected_gap_found = false;
1100  float expected_gap;
1101  int temp_gap;
1102  if (fontinfo_id >= 0) { // found a common font
1103  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1104  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1105  expected_gap = temp_gap;
1106  expected_gap_found = true;
1107  }
1108  } else {
1109  consistency_info->inconsistent_font = true;
1110  // Get an average of the expected gaps in each font
1111  int num_addends = 0;
1112  expected_gap = 0;
1113  int temp_fid;
1114  for (int i = 0; i < 4; ++i) {
1115  if (i == 0) {
1116  temp_fid = parent_b->fontinfo_id();
1117  } else if (i == 1) {
1118  temp_fid = parent_b->fontinfo_id2();
1119  } else if (i == 2) {
1120  temp_fid = b->fontinfo_id();
1121  } else {
1122  temp_fid = b->fontinfo_id2();
1123  }
1124  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1125  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1126  expected_gap += temp_gap;
1127  num_addends++;
1128  }
1129  }
1130  expected_gap_found = (num_addends > 0);
1131  if (num_addends > 0) expected_gap /= static_cast<float>(num_addends);
1132  }
1133  if (expected_gap_found) {
1134  float actual_gap =
1135  static_cast<float>(AssociateUtils::GetChunksGap(
1136  chunks_record->chunk_widths, curr_col-1));
1137  float gap_ratio = expected_gap / actual_gap;
1138  // TODO(daria): find a good way to tune this heuristic estimate.
1139  if (gap_ratio < 1/2 || gap_ratio > 2) {
1140  consistency_info->num_inconsistent_spaces++;
1141  }
1142  if(language_model_debug_level > 1) {
1143  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
1144  unicharset.id_to_unichar(parent_b->unichar_id()),
1145  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1146  unichar_id, curr_col, expected_gap, actual_gap);
1147  }
1148  }
1149  }
1150 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:620
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
int common_sid() const
Definition: unicharset.h:753
inT16 fontinfo_id() const
Definition: ratngs.h:68
inT16 fontinfo_id2() const
Definition: ratngs.h:71
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
#define NULL
Definition: host.h:144
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
const UnicityTable< FontInfo > * fontinfo_table_
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
Definition: dict.h:607
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:552
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:129
int null_sid() const
Definition: unicharset.h:752
static int GetChunksGap(WIDTH_RECORD *width_record, int last_chunk)
Definition: associate.h:132
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT64 NODE_REF
Definition: dawg.h:55
WIDTH_RECORD * chunk_widths
Definition: associate.h:57
int hiragana_sid() const
Definition: unicharset.h:758
int han_sid() const
Definition: unicharset.h:757
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:110
int katakana_sid() const
Definition: unicharset.h:759
LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  script_id,
int  curr_col,
int  curr_row,
const BLOB_CHOICE b,
const ViterbiStateEntry parent_vse,
LanguageModelFlagsType changed 
)
protected

Definition at line 740 of file language_model.cpp.

745  {
746  bool use_fixed_length_dawgs = !fixed_length_beginning_active_dawgs_->empty();
747 
748  // Initialize active_dawgs and constraints from parent_vse if it is not NULL,
749  // otherwise use beginning_active_dawgs_ and beginning_constraints_.
750  if (parent_vse == NULL ||
751  (use_fixed_length_dawgs && parent_vse->dawg_info == NULL)) {
754  dawg_args_->permuter = NO_PERM;
755  } else {
756  if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path
757  dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
758  dawg_args_->constraints = parent_vse->dawg_info->constraints;
759  dawg_args_->permuter = parent_vse->dawg_info->permuter;
760  }
761 
762  // Deal with hyphenated words.
763  if (!use_fixed_length_dawgs && word_end &&
764  dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
765  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
766  *changed |= kDawgFlag;
767  return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
769  COMPOUND_PERM);
770  }
771 
772  // Deal with compound words.
773  if (!use_fixed_length_dawgs && dict_->compound_marker(b.unichar_id()) &&
774  (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
775  if (language_model_debug_level > 0) tprintf("Found compound marker");
776  // Do not allow compound operators at the beginning and end of the word.
777  // Do not allow more than one compound operator per word.
778  // Do not allow compounding of words with lengths shorter than
779  // language_model_min_compound_length
780  if (parent_vse == NULL || word_end ||
781  dawg_args_->permuter == COMPOUND_PERM ||
782  parent_vse->length < language_model_min_compound_length) return NULL;
783 
784  int i;
 785  // Check that the path terminated before the current character is a word.
786  bool has_word_ending = false;
787  for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
788  const DawgInfo &info = (*parent_vse->dawg_info->active_dawgs)[i];
789  const Dawg *pdawg = dict_->GetDawg(info.dawg_index);
790  assert(pdawg != NULL);
791  if (pdawg->type() == DAWG_TYPE_WORD && info.ref != NO_EDGE &&
792  pdawg->end_of_word(info.ref)) {
793  has_word_ending = true;
794  break;
795  }
796  }
797  if (!has_word_ending) return NULL;
798 
799  // Return LanguageModelDawgInfo with active_dawgs set to
800  // the beginning dawgs of type DAWG_TYPE_WORD.
801  if (language_model_debug_level > 0) tprintf("Compound word found\n");
802  DawgInfoVector beginning_word_dawgs;
803  for (i = 0; i < beginning_active_dawgs_->size(); ++i) {
804  const Dawg *bdawg =
805  dict_->GetDawg((*beginning_active_dawgs_)[i].dawg_index);
806  if (bdawg->type() == DAWG_TYPE_WORD) {
807  beginning_word_dawgs += (*beginning_active_dawgs_)[i];
808  }
809  }
810  *changed |= kDawgFlag;
811  return new LanguageModelDawgInfo(&(beginning_word_dawgs),
813  COMPOUND_PERM);
814  } // done dealing with compound words
815 
816  LanguageModelDawgInfo *dawg_info = NULL;
817 
818  // Call LetterIsOkay().
819  dict_->LetterIsOkay(dawg_args_, b.unichar_id(), word_end);
820  if (dawg_args_->permuter != NO_PERM) {
821  *changed |= kDawgFlag;
822  dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_active_dawgs,
825  }
826 
827  // For non-space delimited languages: since every letter could be
828  // a valid word, a new word could start at every unichar. Thus append
829  // fixed_length_beginning_active_dawgs_ to dawg_info->active_dawgs.
830  if (use_fixed_length_dawgs) {
831  if (dawg_info == NULL) {
832  *changed |= kDawgFlag;
833  dawg_info = new LanguageModelDawgInfo(
835  empty_dawg_info_vec_, SYSTEM_DAWG_PERM);
836  } else {
837  *(dawg_info->active_dawgs) += *(fixed_length_beginning_active_dawgs_);
838  }
839  } // done dealing with fixed-length dawgs
840 
841  return dawg_info;
842 }
DawgInfoVector * beginning_active_dawgs_
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:563
#define NULL
Definition: host.h:144
DawgInfoVector * updated_constraints
Definition: dict.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:605
DawgInfoVector * empty_dawg_info_vec_
bool empty() const
Definition: genericvector.h:68
DawgInfoVector * beginning_constraints_
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
int size() const
Definition: genericvector.h:59
DawgInfoVector * updated_active_dawgs
Definition: dict.h:79
DawgInfoVector * active_dawgs
Definition: dict.h:77
DawgInfoVector * constraints
Definition: dict.h:78
PermuterType permuter
Definition: dict.h:81
static const LanguageModelFlagsType kDawgFlag
DawgInfoVector * fixed_length_beginning_active_dawgs_
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:110
LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
const ViterbiStateEntry parent_vse,
BLOB_CHOICE parent_b,
LanguageModelFlagsType changed 
)
protected

Definition at line 844 of file language_model.cpp.

849  {
850  // Initialize parent context.
851  const char *pcontext_ptr = "";
852  int pcontext_unichar_step_len = 0;
853  if (parent_vse == NULL) {
854  pcontext_ptr = prev_word_str_.string();
855  pcontext_unichar_step_len = prev_word_unichar_step_len_;
856  } else {
857  pcontext_ptr = parent_vse->ngram_info->context.string();
858  pcontext_unichar_step_len =
859  parent_vse->ngram_info->context_unichar_step_len;
860  }
861  // Compute p(unichar | parent context).
862  int unichar_step_len = 0;
863  bool pruned = false;
864  float ngram_prob;
865  float ngram_cost = ComputeNgramCost(unichar, certainty, denom,
866  pcontext_ptr, &unichar_step_len,
867  &pruned, &ngram_prob);
868  // First attempt to normalize ngram_cost for strings of different
869  // lengths - we multiply ngram_cost by P(char | context) as many times
870  // as the number of chunks occupied by char. This makes the ngram costs
871  // of all the paths ending at the current BLOB_CHOICE comparable.
872  // TODO(daria): it would be worth looking at different ways of normalization.
873  if (curr_row > curr_col) {
874  ngram_cost += (curr_row - curr_col) * ngram_cost;
875  ngram_prob += (curr_row - curr_col) * ngram_prob;
876  }
877  // Add the ngram_cost of the parent.
878  if (parent_vse != NULL) {
879  ngram_cost += parent_vse->ngram_info->ngram_cost;
880  ngram_prob += parent_vse->ngram_info->ngram_prob;
881  }
882 
883  // Shorten parent context string by unichar_step_len unichars.
884  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
886  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
887  while (num_remove > 0 && *pcontext_ptr != '\0') {
888  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
889  --num_remove;
890  }
891 
892  // Decide whether to prune this ngram path and update changed accordingly.
893  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
894  if (!pruned) *changed |= kNgramFlag;
895 
896  // Construct and return the new LanguageModelNgramInfo.
897  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
898  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_prob, ngram_cost);
899  ngram_info->context += unichar;
900  ngram_info->context_unichar_step_len += unichar_step_len;
901  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
902  return ngram_info;
903 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:131
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:156
static const LanguageModelFlagsType kNgramFlag
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
void tesseract::LanguageModel::GenerateNgramModelPainPointsFromColumn ( int  col,
int  row,
HEAP pain_points,
CHUNKS_RECORD chunks_record 
)

Definition at line 1604 of file language_model.cpp.

1605  {
1606  // Find the first top choice path recorded for this cell.
1607  // If this path is pruned - generate a pain point.
1608  ASSERT_HOST(chunks_record->ratings->get(col, row) != NULL);
1609  BLOB_CHOICE_IT bit(chunks_record->ratings->get(col, row));
1610  bool fragmented = false;
1611  for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
1612  if (dict_->getUnicharset().get_fragment(bit.data()->unichar_id())) {
1613  fragmented = true;
1614  continue;
1615  }
1616  LanguageModelState *lms = reinterpret_cast<LanguageModelState *>(
1617  bit.data()->language_model_state());
1618  if (lms == NULL) continue;
1619  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
1620  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
1621  const ViterbiStateEntry *vse = vit.data();
1622  if (!vse->top_choice_flags) continue;
1623  ASSERT_HOST(vse->ngram_info != NULL);
1624  if (vse->ngram_info->pruned && (vse->parent_vse == NULL ||
1625  !vse->parent_vse->ngram_info->pruned)) {
1626  if (vse->parent_vse != NULL) {
1627  LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
1628  vse->parent_b->language_model_state());
1629  GeneratePainPoint(pp_lms->contained_in_col, row, false,
1631  -1.0f, fragmented, -1.0f,
1633  vse->parent_vse->parent_b,
1634  vse->parent_vse->parent_vse,
1635  chunks_record, pain_points);
1636  }
1637  if (vse->parent_vse != NULL &&
1638  vse->parent_vse->parent_vse != NULL &&
1640  vse->parent_b->unichar_id())) {
1641  // If the dip in the ngram probability is due to punctuation in the
1642  // middle of the word - go two unichars back to combine this
1643  // punctuation mark with the previous character on the path.
1644  LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
1645  vse->parent_vse->parent_b->language_model_state());
1646  GeneratePainPoint(pp_lms->contained_in_col, col-1, false,
1648  -1.0f, fragmented, -1.0f,
1650  vse->parent_vse->parent_vse->parent_b,
1651  vse->parent_vse->parent_vse->parent_vse,
1652  chunks_record, pain_points);
1653  } else if (row+1 < chunks_record->ratings->dimension()) {
1654  GeneratePainPoint(col, row+1, true,
1656  -1.0f, fragmented, -1.0f,
1658  vse->parent_b,
1659  vse->parent_vse,
1660  chunks_record, pain_points);
1661  }
1662  }
1663  return; // examined the lowest cost top choice path
1664  }
1665  }
1666 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
static const float kDefaultPainPointPriorityAdjustment
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T get(int column, int row) const
Definition: matrix.h:117
#define f(xc, yc)
Definition: imgscale.cpp:39
int dimension() const
Definition: matrix.h:190
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
bool GeneratePainPoint(int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
MATRIX * ratings
Definition: associate.h:52
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool tesseract::LanguageModel::GeneratePainPoint ( int  col,
int  row,
bool  ok_to_extend,
float  priority_adjustment,
float  worst_piece_cert,
bool  fragmented,
float  best_choice_cert,
float  max_char_wh_ratio,
BLOB_CHOICE parent_b,
ViterbiStateEntry parent_vse,
CHUNKS_RECORD chunks_record,
HEAP pain_points 
)

Definition at line 1923 of file language_model.cpp.

1928  {
1929  if (col < 0 || row >= chunks_record->ratings->dimension() ||
1930  chunks_record->ratings->get(col, row) != NOT_CLASSIFIED) {
1931  return false;
1932  }
1933  if (language_model_debug_level > 3) {
1934  tprintf("\nGenerating pain point for col=%d row=%d priority=%g parent=",
1935  col, row, priority);
1936  if (parent_vse != NULL) {
1937  PrintViterbiStateEntry("", parent_vse, parent_b, chunks_record);
1938  } else {
1939  tprintf("NULL");
1940  }
1941  tprintf("\n");
1942  }
1943 
1944  AssociateStats associate_stats;
1945  ComputeAssociateStats(col, row, max_char_wh_ratio, parent_vse,
1946  chunks_record, &associate_stats);
1947  // For fixed-pitch fonts/languages: if the current combined blob overlaps
1948  // the next blob on the right and it is ok to extend the blob, try expending
1949  // the blob until there is no overlap with the next blob on the right or
1950  // until the width-to-height ratio becomes too large.
1951  if (ok_to_extend) {
1952  while (associate_stats.bad_fixed_pitch_right_gap &&
1953  row+1 < chunks_record->ratings->dimension() &&
1954  !associate_stats.bad_fixed_pitch_wh_ratio) {
1955  ComputeAssociateStats(col, ++row, max_char_wh_ratio, parent_vse,
1956  chunks_record, &associate_stats);
1957  }
1958  }
1959 
1960  if (associate_stats.bad_shape) {
1961  if (language_model_debug_level > 3) {
1962  tprintf("Discarded pain point with a bad shape\n");
1963  }
1964  return false;
1965  }
1966 
1967  // Compute pain point priority.
1968  if (associate_stats.shape_cost > 0) {
1969  priority *= associate_stats.shape_cost;
1970  }
1971  if (worst_piece_cert < best_choice_cert) {
1972  worst_piece_cert = best_choice_cert;
1973  }
1974  priority *= CertaintyScore(worst_piece_cert);
1975  if (fragmented) priority /= kDefaultPainPointPriorityAdjustment;
1976 
1977  if (language_model_debug_level > 3) {
1978  tprintf("worst_piece_cert=%g fragmented=%d\n",
1979  worst_piece_cert, fragmented);
1980  }
1981 
1982  if (parent_vse != NULL) {
1983  priority *= sqrt(parent_vse->cost / static_cast<float>(col));
1984  if (parent_vse->dawg_info != NULL) {
1986  if (parent_vse->length > language_model_min_compound_length) {
1987  priority /= sqrt(static_cast<double>(parent_vse->length));
1988  }
1989  }
1990  }
1991 
1992  MATRIX_COORD *pain_point = new MATRIX_COORD(col, row);
1993  if (HeapPushCheckSize(pain_points, priority, pain_point)) {
1995  tprintf("Added pain point with priority %g\n", priority);
1996  }
1997  return true;
1998  } else {
1999  delete pain_point;
2000  if (language_model_debug_level) tprintf("Pain points heap is full\n");
2001  return false;
2002  }
2003 }
float CertaintyScore(float cert)
static const float kDefaultPainPointPriorityAdjustment
#define NULL
Definition: host.h:144
T get(int column, int row) const
Definition: matrix.h:117
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, AssociateStats *associate_stats)
int dimension() const
Definition: matrix.h:190
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool HeapPushCheckSize(HEAP *Heap, FLOAT32 Key, void *Data)
Definition: oldheap.cpp:170
#define NOT_CLASSIFIED
Definition: matrix.h:31
MATRIX * ratings
Definition: associate.h:52
void PrintViterbiStateEntry(const char *msg, ViterbiStateEntry *vse, BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record)
void tesseract::LanguageModel::GeneratePainPointsFromBestChoice ( HEAP pain_points,
CHUNKS_RECORD chunks_record,
BestChoiceBundle best_choice_bundle 
)

Definition at line 1752 of file language_model.cpp.

1755  {
1756  // Variables to backtrack best_vse path;
1757  ViterbiStateEntry *curr_vse = best_choice_bundle->best_vse;
1758  BLOB_CHOICE *curr_b = best_choice_bundle->best_b;
1759 
1760  // Begins and ends in DANGERR vector record the positions in the blob choice
1761  // list of the best choice. We need to translate these endpoints into the
1762  // beginning column and ending row for the pain points. We maintain
1763  // danger_begin and danger_end arrays indexed by the position in
1764  // best_choice_bundle->best_char_choices (which is equal to the position
1765  // on the best_choice_bundle->best_vse path).
1766  // danger_end[d] stores the DANGERR_INFO structs with end==d and is
1767  // initialized at the beginning of this function.
1768  // danger_begin[d] stores the DANGERR_INFO struct with begin==d and
1769  // has end set to the row of the end of this ambiguity.
1770  // The translation from end in terms of the best choice index to the end row
1771  // is done while following the parents of best_choice_bundle->best_vse.
1772  assert(best_choice_bundle->best_char_choices->length() ==
1773  best_choice_bundle->best_vse->length);
1774  DANGERR *danger_begin = NULL;
1775  DANGERR *danger_end = NULL;
1776  int d;
1777  if (!best_choice_bundle->fixpt.empty()) {
1778  danger_begin = new DANGERR[best_choice_bundle->best_vse->length];
1779  danger_end = new DANGERR[best_choice_bundle->best_vse->length];
1780  for (d = 0; d < best_choice_bundle->fixpt.size(); ++d) {
1781  const DANGERR_INFO &danger = best_choice_bundle->fixpt[d];
1782  // Only use n->1 ambiguities.
1783  if (danger.end > danger.begin && !danger.correct_is_ngram &&
1784  (!language_model_ngram_on || danger.dangerous)) {
1785  danger_end[danger.end].push_back(danger);
1786  }
1787  }
1788  }
1789 
1790  // Variables to keep track of punctuation/number streaks.
1791  int punc_streak_end_row = -1;
1792  int punc_streak_length = 0;
1793  float punc_streak_min_cert = 0.0f;
1794 
1795  if (language_model_debug_level > 0) {
1796  tprintf("\nGenerating pain points for best path=%p\n", curr_vse);
1797  }
1798 
1799  int word_index = best_choice_bundle->best_vse->length;
1800  while (curr_vse != NULL) {
1801  word_index--;
1802  ASSERT_HOST(word_index >= 0);
1803  ASSERT_HOST(curr_b != NULL);
1804  if (language_model_debug_level > 0) {
1805  tprintf("Looking at unichar %s\n",
1807  }
1808 
1809  int pp_col = reinterpret_cast<LanguageModelState *>(
1810  curr_b->language_model_state())->contained_in_col;
1811  int pp_row = reinterpret_cast<LanguageModelState *>(
1812  curr_b->language_model_state())->contained_in_row;
1813 
1814  // Generate pain points for ambiguities found by NoDangerousAmbig().
1815  if (danger_end != NULL) {
1816  // Translate end index of an ambiguity to an end row.
1817  for (d = 0; d < danger_end[word_index].size(); ++d) {
1818  danger_end[word_index][d].end = pp_row;
1819  danger_begin[danger_end[word_index][d].begin].push_back(
1820  danger_end[word_index][d]);
1821  }
1822  // Generate a pain point to combine unchars in the "wrong" part
1823  // of the ambiguity.
1824  for (d = 0; d < danger_begin[word_index].size(); ++d) {
1825  if (language_model_debug_level > 0) {
1826  tprintf("Generating pain point from %sambiguity\n",
1827  danger_begin[word_index][d].dangerous ? "dangerous " : "");
1828  }
1829  GeneratePainPoint(pp_col, danger_begin[word_index][d].end, false,
1830  danger_begin[word_index][d].dangerous ?
1833  best_choice_bundle->best_choice->certainty(), true,
1834  best_choice_bundle->best_choice->certainty(),
1836  curr_vse->parent_b, curr_vse->parent_vse,
1837  chunks_record, pain_points);
1838  }
1839  }
1840 
1841  if (!language_model_ngram_on) { // no need to use further heuristics if we
1842  // are guided by the character ngram model
1843  // Generate pain points for problematic sub-paths.
1844  if (ProblematicPath(*curr_vse, curr_b->unichar_id(),
1845  pp_row+1 == chunks_record->ratings->dimension())) {
1846  if (language_model_debug_level > 0) {
1847  tprintf("Generating pain point from a problematic sub-path\n");
1848  }
1849  float worst_piece_cert;
1850  bool fragment;
1851  if (pp_col > 0) {
1852  GetWorstPieceCertainty(pp_col-1, pp_row, chunks_record->ratings,
1853  &worst_piece_cert, &fragment);
1854  GeneratePainPoint(pp_col-1, pp_row, false,
1856  worst_piece_cert, true,
1857  best_choice_bundle->best_choice->certainty(),
1859  chunks_record, pain_points);
1860  }
1861  if (pp_row+1 < chunks_record->ratings->dimension()) {
1862  GetWorstPieceCertainty(pp_col, pp_row+1, chunks_record->ratings,
1863  &worst_piece_cert, &fragment);
1864  GeneratePainPoint(pp_col, pp_row+1, true,
1866  worst_piece_cert, true,
1867  best_choice_bundle->best_choice->certainty(),
1869  chunks_record, pain_points);
1870  }
1871  }
1872 
1873  // Generate a pain point if we encountered a punctuation/number streak to
1874  // combine all punctuation marks into one blob and try to classify it.
1875  bool is_alpha = dict_->getUnicharset().get_isalpha(curr_b->unichar_id());
1876  if (!is_alpha) {
1877  if (punc_streak_end_row == -1) punc_streak_end_row = pp_row;
1878  punc_streak_length++;
1879  if (curr_b->certainty() < punc_streak_min_cert)
1880  punc_streak_min_cert = curr_b->certainty();
1881  }
1882  if (is_alpha || curr_vse->parent_vse == NULL) {
1883  if (punc_streak_end_row != -1 && punc_streak_length > 1) {
1884  if (language_model_debug_level > 0) {
1885  tprintf("Generating pain point from a punctuation streak\n");
1886  }
1887  if (is_alpha ||
1888  (curr_vse->parent_vse == NULL && punc_streak_length > 2)) {
1889  GeneratePainPoint(pp_row+1, punc_streak_end_row, false,
1891  punc_streak_min_cert, true,
1892  best_choice_bundle->best_choice->certainty(),
1893  max_char_wh_ratio_, curr_b, curr_vse,
1894  chunks_record, pain_points);
1895  }
1896  // Handle punctuation/number streaks starting from the first unichar.
1897  if (curr_vse->parent_vse == NULL) {
1898  GeneratePainPoint(0, punc_streak_end_row, false,
1900  punc_streak_min_cert, true,
1901  best_choice_bundle->best_choice->certainty(),
1903  chunks_record, pain_points);
1904  }
1905  }
1906  punc_streak_end_row = -1;
1907  punc_streak_length = 0;
1908  punc_streak_min_cert = 0.0f;
1909  } // end handling punctuation streaks
1910  }
1911 
1912  curr_b = curr_vse->parent_b;
1913  curr_vse = curr_vse->parent_vse;
1914  } // end looking at best choice subpaths
1915 
1916  // Clean up.
1917  if (danger_end != NULL) {
1918  delete[] danger_begin;
1919  delete[] danger_end;
1920  }
1921 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
bool dangerous
Definition: stopper.h:42
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
int push_back(T object)
static const float kBestChoicePainPointPriorityAdjustment
bool ProblematicPath(const ViterbiStateEntry &vse, UNICHAR_ID unichar_id, bool word_end)
float certainty() const
Definition: ratngs.h:65
int begin
Definition: stopper.h:40
int dimension() const
Definition: matrix.h:190
static const float kCriticalPainPointPriorityAdjustment
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool correct_is_ngram
Definition: stopper.h:43
void * language_model_state()
Definition: ratngs.h:77
int size() const
Definition: genericvector.h:59
void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
bool GeneratePainPoint(int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
MATRIX * ratings
Definition: associate.h:52
#define ASSERT_HOST(x)
Definition: errcode.h:84
static const float kLooseMaxCharWhRatio
void tesseract::LanguageModel::GeneratePainPointsFromColumn ( int  col,
const GenericVector< int > &  non_empty_rows,
float  best_choice_cert,
HEAP pain_points,
BestPathByColumn best_path_by_column[],
CHUNKS_RECORD chunks_record 
)

Definition at line 1581 of file language_model.cpp.

1587  {
1588  for (int i = 0; i < non_empty_rows.length(); ++i) {
1589  int row = non_empty_rows[i];
1590  if (language_model_debug_level > 0) {
1591  tprintf("\nLooking for pain points in col=%d row=%d\n", col, row);
1592  }
1595  col, row, pain_points, chunks_record);
1596  } else {
1598  col, row, best_choice_cert, pain_points,
1599  best_path_by_column, chunks_record);
1600  }
1601  }
1602 }
void GenerateProblematicPathPainPointsFromColumn(int col, int row, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
void GenerateNgramModelPainPointsFromColumn(int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int length() const
Definition: genericvector.h:63
void tesseract::LanguageModel::GenerateProblematicPathPainPointsFromColumn ( int  col,
int  row,
float  best_choice_cert,
HEAP pain_points,
BestPathByColumn best_path_by_column[],
CHUNKS_RECORD chunks_record 
)

Definition at line 1668 of file language_model.cpp.

1671  {
1672  MATRIX *ratings = chunks_record->ratings;
1673 
1674  // Get the best path from this matrix cell.
1675  BLOB_CHOICE_LIST *blist = ratings->get(col, row);
1676  ASSERT_HOST(blist != NULL);
1677  if (blist->empty()) return;
1678  BLOB_CHOICE_IT bit(blist);
1679  bool fragment = false;
1680  while (dict_->getUnicharset().get_fragment(bit.data()->unichar_id()) &&
1681  !bit.at_last()) { // skip fragments
1682  fragment = true;
1683  bit.forward();
1684  }
1685  if (bit.data()->language_model_state() == NULL) return;
1686  ViterbiStateEntry_IT vit(&(reinterpret_cast<LanguageModelState *>(
1687  bit.data()->language_model_state())->viterbi_state_entries));
1688  if (vit.empty()) return;
1689  ViterbiStateEntry *vse = vit.data();
1690  // Check whether this path is promising.
1691  bool path_is_promising = true;
1692  if (vse->parent_vse != NULL) {
1693  float potential_avg_cost =
1694  ((vse->parent_vse->cost + bit.data()->rating()*0.5f) /
1695  static_cast<float>(row+1));
1696  if (language_model_debug_level > 0) {
1697  tprintf("potential_avg_cost %g best cost %g\n",
1698  potential_avg_cost, (*best_path_by_column)[col].avg_cost);
1699  }
1700  if (potential_avg_cost >= (*best_path_by_column)[col].avg_cost) {
1701  path_is_promising = false;
1702  }
1703  }
1704  // Set best_parent_vse to the best parent recorded in best_path_by_column.
1705  ViterbiStateEntry *best_parent_vse = vse->parent_vse;
1706  BLOB_CHOICE *best_parent_b = vse->parent_b;
1707  if (col > 0 && (*best_path_by_column)[col-1].best_vse != NULL) {
1708  ASSERT_HOST((*best_path_by_column)[col-1].best_b != NULL);
1709  LanguageModelState *best_lms = reinterpret_cast<LanguageModelState *>(
1710  ((*best_path_by_column)[col-1].best_b)->language_model_state());
1711  if (best_lms->contained_in_row == col-1) {
1712  best_parent_vse = (*best_path_by_column)[col-1].best_vse;
1713  best_parent_b = (*best_path_by_column)[col-1].best_b;
1714  if (language_model_debug_level > 0) {
1715  tprintf("Setting best_parent_vse to %p\n", best_parent_vse);
1716  }
1717  }
1718  }
1719  // Check whether this entry terminates the best parent path
1720  // recorded in best_path_by_column.
1721  bool best_not_prolonged = (best_parent_vse != vse->parent_vse);
1722 
1723  // If this path is problematic because of the last unichar - generate
1724  // a pain point to combine it with its left and right neighbor.
1725  BLOB_CHOICE_IT tmp_bit;
1726  if (best_not_prolonged ||
1727  (path_is_promising &&
1728  ProblematicPath(*vse, bit.data()->unichar_id(),
1729  row+1 == ratings->dimension()))) {
1730  float worst_piece_cert;
1731  bool fragmented;
1732  if (col-1 > 0) {
1733  GetWorstPieceCertainty(col-1, row, chunks_record->ratings,
1734  &worst_piece_cert, &fragmented);
1735  GeneratePainPoint(col-1, row, false,
1737  worst_piece_cert, fragmented, best_choice_cert,
1738  max_char_wh_ratio_, best_parent_b, best_parent_vse,
1739  chunks_record, pain_points);
1740  }
1741  if (row+1 < ratings->dimension()) {
1742  GetWorstPieceCertainty(col, row+1, chunks_record->ratings,
1743  &worst_piece_cert, &fragmented);
1745  worst_piece_cert, fragmented, best_choice_cert,
1746  max_char_wh_ratio_, best_parent_b, best_parent_vse,
1747  chunks_record, pain_points);
1748  }
1749  } // for ProblematicPath
1750 }
static const float kDefaultPainPointPriorityAdjustment
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T get(int column, int row) const
Definition: matrix.h:117
bool ProblematicPath(const ViterbiStateEntry &vse, UNICHAR_ID unichar_id, bool word_end)
int dimension() const
Definition: matrix.h:190
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: matrix.h:193
void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
bool GeneratePainPoint(int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
MATRIX * ratings
Definition: associate.h:52
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::LanguageModel::GenerateTopChoiceInfo ( float  ratings_sum,
const LanguageModelDawgInfo dawg_info,
const LanguageModelConsistencyInfo consistency_info,
const ViterbiStateEntry parent_vse,
BLOB_CHOICE b,
LanguageModelFlagsType top_choice_flags,
LanguageModelFlagsType changed 
)
protected

Definition at line 705 of file language_model.cpp.

712  {
714  ratings_sum, dawg_info, consistency_info);
715  // Clear flags that do not agree with parent_vse->top_choice_flags.
716  if (parent_vse != NULL) *top_choice_flags &= parent_vse->top_choice_flags;
717  if (consistency_info.Consistent()) *top_choice_flags |= kConsistentFlag;
718  if (*top_choice_flags == 0x0) return;
719  LanguageModelState *lms =
720  reinterpret_cast<LanguageModelState *>(b->language_model_state());
721  if (lms != NULL && !lms->viterbi_state_entries.empty()) {
722  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
723  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
724  if (ratings_sum >= ComputeConsistencyAdjustedRatingsSum(
725  vit.data()->ratings_sum, vit.data()->dawg_info,
726  vit.data()->consistency_info)) {
727  // Clear the appropriate flags if the list already contains
728  // a top choice entry with a lower cost.
729  *top_choice_flags &= ~(vit.data()->top_choice_flags);
730  }
731  }
732  }
733  if (language_model_debug_level > 0) {
734  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
735  *top_choice_flags);
736  }
737  *changed |= *top_choice_flags;
738 }
static const LanguageModelFlagsType kConsistentFlag
#define NULL
Definition: host.h:144
float ComputeConsistencyAdjustedRatingsSum(float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void * language_model_state()
Definition: ratngs.h:77
void tesseract::LanguageModel::GetPieceCertainty ( BLOB_CHOICE_LIST *  blist,
float *  cert,
bool *  fragmented 
)
inlineprotected

Definition at line 472 of file language_model.h.

473  {
474  if (blist == NOT_CLASSIFIED || blist->empty()) return;
475  BLOB_CHOICE_IT bit(blist);
476  while (!bit.at_last() && IsFragment(bit.data())) {
477  *fragmented = true;
478  bit.forward(); // skip fragments
479  }
480  // Each classification must have at least one non-fragmented choice.
481  ASSERT_HOST(!IsFragment(bit.data()));
482  if (bit.data()->certainty() < *cert) *cert = bit.data()->certainty();
483  }
bool IsFragment(BLOB_CHOICE *b)
#define NOT_CLASSIFIED
Definition: matrix.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::LanguageModel::GetTopChoiceLowerUpper ( LanguageModelFlagsType  changed,
BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper 
)
protected

Definition at line 426 of file language_model.cpp.

429  {
430  if (!(changed & kLowerCaseFlag || changed & kUpperCaseFlag)) return;
431  BLOB_CHOICE_IT c_it(curr_list);
432  const UNICHARSET &unicharset = dict_->getUnicharset();
433  BLOB_CHOICE *first_unichar = NULL;
434  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
435  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
436  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
437  if (first_unichar == NULL) first_unichar = c_it.data();
438  if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
439  *first_lower = c_it.data();
440  }
441  if (*first_upper == NULL && unicharset.get_isupper(unichar_id)) {
442  *first_upper = c_it.data();
443  }
444  }
445  ASSERT_HOST(first_unichar != NULL);
446  if (*first_lower == NULL) *first_lower = first_unichar;
447  if (*first_upper == NULL) *first_upper = first_unichar;
448 }
int UNICHAR_ID
Definition: unichar.h:31
#define NULL
Definition: host.h:144
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
static const LanguageModelFlagsType kLowerCaseFlag
static const LanguageModelFlagsType kUpperCaseFlag
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::LanguageModel::GetWorstPieceCertainty ( int  col,
int  row,
MATRIX ratings,
float *  cert,
bool *  fragmented 
)
inline

Definition at line 413 of file language_model.h.

414  {
415  *cert = 0.0f;
416  *fragmented = false;
417  if (row > 0) {
418  GetPieceCertainty(ratings->get(col, row-1), cert, fragmented);
419  }
420  if (col+1 < ratings->dimension()) {
421  GetPieceCertainty(ratings->get(col+1, row), cert, fragmented);
422  }
423  ASSERT_HOST(*cert < 0.0f);
424  }
T get(int column, int row) const
Definition: matrix.h:117
#define f(xc, yc)
Definition: imgscale.cpp:39
int dimension() const
Definition: matrix.h:190
void GetPieceCertainty(BLOB_CHOICE_LIST *blist, float *cert, bool *fragmented)
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE prev_word,
bool  fixed_pitch,
float  best_choice_cert,
float  max_char_wh_ratio,
float  rating_cert_scale,
HEAP pain_points,
CHUNKS_RECORD chunks_record,
BlamerBundle blamer_bundle,
bool  debug_blamer 
)

Definition at line 141 of file language_model.cpp.

145  {
146  fixed_pitch_ = fixed_pitch;
147  max_char_wh_ratio_ = max_char_wh_ratio;
148  rating_cert_scale_ = rating_cert_scale;
149  acceptable_choice_found_ = false;
151 
152  // For each cell, generate a "pain point" if the cell is not classified
153  // and has a left or right neighbor that was classified.
154  MATRIX *ratings = chunks_record->ratings;
155  for (int col = 0; col < ratings->dimension(); ++col) {
156  for (int row = col+1; row < ratings->dimension(); ++row) {
157  if ((row > 0 && ratings->get(col, row-1) != NOT_CLASSIFIED) ||
158  (col+1 < ratings->dimension() &&
159  ratings->get(col+1, row) != NOT_CLASSIFIED)) {
160  float worst_piece_cert;
161  bool fragmented;
162  GetWorstPieceCertainty(col, row, chunks_record->ratings,
163  &worst_piece_cert, &fragmented);
165  worst_piece_cert, fragmented, best_choice_cert,
167  chunks_record, pain_points);
168  }
169  }
170  }
171 
172  // Initialize vectors with beginning DawgInfos.
174  dict_->init_active_dawgs(kAnyWordLength, beginning_active_dawgs_, false);
177  if (dict_->GetMaxFixedLengthDawgIndex() >= 0) {
179  for (int i = 0; i < beginning_active_dawgs_->size(); ++i) {
180  int dawg_index = (*beginning_active_dawgs_)[i].dawg_index;
181  if (dawg_index <= dict_->GetMaxFixedLengthDawgIndex() &&
182  dawg_index >= kMinFixedLengthDawgLength) {
183  *fixed_length_beginning_active_dawgs_ += (*beginning_active_dawgs_)[i];
184  }
185  }
186  }
187 
190 
191  // Fill prev_word_str_ with the last language_model_ngram_order
192  // unichars from prev_word.
194  if (prev_word != NULL && prev_word->unichar_string() != NULL) {
195  prev_word_str_ = prev_word->unichar_string();
197  } else {
198  prev_word_str_ += ' ';
199  }
200  const char *str_ptr = prev_word_str_.string();
201  const char *str_end = str_ptr + prev_word_str_.length();
202  int step;
204  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
205  str_ptr += step;
207  }
208  ASSERT_HOST(str_ptr == str_end);
209  }
210 
211  // Initialize blamer-related information: map character boxes recorded in
212  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
213  // ratings matrix. We expect this step to succeed, since when running the
214  // chopper we checked that the correct chops are present.
215  if (blamer_bundle != NULL &&
216  blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
217  blamer_bundle->truth_has_char_boxes) {
218  STRING blamer_debug;
219  blamer_debug += "Blamer computing correct_segmentation_cols\n";
220  int curr_box_col = 0;
221  int next_box_col = 0;
222  TBLOB *blob = chunks_record->chunks;
223  inT16 next_box_x = (blob != NULL) ? blob->bounding_box().right() : 0;
224  for (int truth_idx = 0;
225  blob != NULL && truth_idx < blamer_bundle->norm_truth_word.length();
226  blob = blob->next) {
227  ++next_box_col;
228  inT16 curr_box_x = next_box_x;
229  if (blob->next != NULL) next_box_x = blob->next->bounding_box().right();
230  inT16 truth_x = blamer_bundle->norm_truth_word.BlobBox(truth_idx).right();
231  blamer_debug.add_str_int("Box x coord vs. truth: ", curr_box_x);
232  blamer_debug.add_str_int(" ", truth_x);
233  blamer_debug += "\n";
234  if (curr_box_x > (truth_x + blamer_bundle->norm_box_tolerance)) {
235  break; // failed to find a matching box
236  } else if (curr_box_x >=
237  (truth_x - blamer_bundle->norm_box_tolerance) && // matched
238  (blob->next == NULL || // next box can't be included
239  next_box_x > truth_x + blamer_bundle->norm_box_tolerance)) {
240  blamer_bundle->correct_segmentation_cols.push_back(curr_box_col);
241  blamer_bundle->correct_segmentation_rows.push_back(next_box_col-1);
242  ++truth_idx;
243  blamer_debug.add_str_int("col=", curr_box_col);
244  blamer_debug.add_str_int(" row=", next_box_col-1);
245  blamer_debug += "\n";
246  curr_box_col = next_box_col;
247  }
248  }
249  if (blob != NULL || // trailing blobs
250  blamer_bundle->correct_segmentation_cols.length() !=
251  blamer_bundle->norm_truth_word.length()) {
252  blamer_debug.add_str_int("Blamer failed to find correct segmentation"
253  " (tolerance=", blamer_bundle->norm_box_tolerance);
254  if (blob == NULL) blamer_debug += " blob == NULL";
255  blamer_debug += ")\n";
256  blamer_debug.add_str_int(
257  " path length ", blamer_bundle->correct_segmentation_cols.length());
258  blamer_debug.add_str_int(" vs. truth ",
259  blamer_bundle->norm_truth_word.length());
260  blamer_debug += "\n";
261  blamer_bundle->SetBlame(IRR_UNKNOWN, blamer_debug, NULL, debug_blamer);
262  blamer_bundle->correct_segmentation_cols.clear();
263  blamer_bundle->correct_segmentation_rows.clear();
264  }
265  } // end if (blamer_bundle != NULL)
266 
267  // Start a new hypothesis list for this run of segmentation search.
268  if (blamer_bundle != NULL) {
270  }
271 }
const STRING & unichar_string() const
Definition: ratngs.h:395
const int length() const
Definition: boxword.h:99
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
TBLOB * chunks
Definition: associate.h:53
GenericVector< int > correct_segmentation_cols
Definition: pageres.h:190
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:131
virtual void clear()
DawgInfoVector * beginning_active_dawgs_
inT32 length() const
Definition: strngs.cpp:151
const int GetMaxFixedLengthDawgIndex() const
Definition: dict.h:616
#define NULL
Definition: host.h:144
T get(int column, int row) const
Definition: matrix.h:117
int push_back(T object)
bool language_model_ngram_space_delimited_language
inT16 right() const
Definition: rect.h:74
static const float kInitialPainPointPriorityAdjustment
void add_str_int(const char *str, int number)
Definition: strngs.cpp:334
void init_active_dawgs(int sought_word_length, DawgInfoVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:643
double segment_penalty_dict_nonword
Definition: dict.h:830
tesseract::BoxWord norm_truth_word
Definition: pageres.h:170
int dimension() const
Definition: matrix.h:190
Definition: blobs.h:174
const char * string() const
Definition: strngs.cpp:156
tesseract::ParamsTrainingBundle params_training_bundle
Definition: pageres.h:199
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
DawgInfoVector * beginning_constraints_
void init_constraints(DawgInfoVector *constraints) const
Definition: dict.cpp:677
TBOX bounding_box() const
Definition: blobs.cpp:384
#define NOT_CLASSIFIED
Definition: matrix.h:31
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
short inT16
Definition: host.h:100
int length() const
Definition: genericvector.h:63
Definition: matrix.h:193
bool truth_has_char_boxes
Definition: pageres.h:164
const TBOX & BlobBox(int index) const
Definition: boxword.h:102
void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
double segment_penalty_dict_case_ok
Definition: dict.h:818
bool GeneratePainPoint(int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
static const int kMinFixedLengthDawgLength
MATRIX * ratings
Definition: associate.h:52
#define ASSERT_HOST(x)
Definition: errcode.h:84
GenericVector< int > correct_segmentation_rows
Definition: pageres.h:191
int norm_box_tolerance
Definition: pageres.h:172
DawgInfoVector * fixed_length_beginning_active_dawgs_
TBLOB * next
Definition: blobs.h:228
bool tesseract::LanguageModel::IsFragment ( BLOB_CHOICE * b)
inlineprotected

Definition at line 459 of file language_model.h.

459  {
     // True iff b's unichar is a character fragment (a piece of a split
     // character). get_fragment() returns a const CHAR_FRAGMENT* which is
     // implicitly converted to bool here: non-NULL means "is a fragment".
460  return dict_->getUnicharset().get_fragment(b->unichar_id());
461  }
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
bool tesseract::LanguageModel::IsHan ( int  script_id)
inlineprotected

Definition at line 463 of file language_model.h.

463  {
     // True iff script_id is the Han script id. The first clause guards
     // against unicharsets that have no Han script at all, in which case
     // han_sid() equals null_sid() and the function always returns false.
464  return ((dict_->getUnicharset().han_sid() !=
465  dict_->getUnicharset().null_sid()) &&
466  (script_id == dict_->getUnicharset().han_sid()));
467  }
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
int null_sid() const
Definition: unicharset.h:752
int han_sid() const
Definition: unicharset.h:757
bool tesseract::LanguageModel::NonAlphaOrDigitMiddle ( int  col,
int  row,
int  dimension,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 452 of file language_model.h.

453  {
     // True iff the unichar is neither a letter nor a digit AND the
     // (col, row) cell is strictly in the interior of the word:
     // col > 0 excludes the first column, row+1 < dimension excludes the
     // last row, so punctuation at word boundaries is not flagged.
454  return (!dict_->getUnicharset().get_isalpha(unichar_id) &&
455  !dict_->getUnicharset().get_isdigit(unichar_id) &&
456  col > 0 && row+1 < dimension);
457  }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
void tesseract::LanguageModel::PrintViterbiStateEntry ( const char *  msg,
ViterbiStateEntry *  vse,
BLOB_CHOICE *  b,
CHUNKS_RECORD *  chunks_record 
)
protected

Definition at line 669 of file language_model.cpp.

671  {
     // Debug helper: dumps one ViterbiStateEntry to the log. Always prints
     // the core fields; optional sections are printed only when present.
672  tprintf("%s ViterbiStateEntry %p with ratings_sum=%.4f length=%d cost=%.4f",
673  msg, vse, vse->ratings_sum, vse->length, vse->cost);
674  if (vse->top_choice_flags) {
675  tprintf(" top_choice_flags=0x%x", vse->top_choice_flags);
676  }
     // Per-category inconsistency counts (punctuation/case/chartype/script).
677  if (!vse->Consistent()) {
678  tprintf(" inconsistent=(punc %d case %d chartype %d script %d)\n",
679  vse->consistency_info.NumInconsistentPunc(),
680  vse->consistency_info.NumInconsistentCase(),
681  vse->consistency_info.NumInconsistentChartype(),
682  vse->consistency_info.inconsistent_script);
683  }
684  if (vse->dawg_info) tprintf(" permuter=%d", vse->dawg_info->permuter);
685  if (vse->ngram_info) {
686  tprintf(" ngram_cost=%g context=%s ngram pruned=%d",
687  vse->ngram_info->ngram_cost,
688  vse->ngram_info->context.string(),
689  vse->ngram_info->pruned);
690  }
691  if (vse->associate_stats.shape_cost > 0.0f) {
692  tprintf(" shape_cost=%g", vse->associate_stats.shape_cost);
693  }
     // At high debug levels, reconstruct the word string for this path.
     // ConstructWord allocates a temporary WERD_CHOICE that is deleted
     // immediately after its string form is extracted.
694  if (language_model_debug_level > 3) {
695  STRING wd_str;
696  WERD_CHOICE *wd = ConstructWord(b, vse, chunks_record,
697  NULL, NULL, NULL, NULL, NULL, NULL);
698  wd->string_and_lengths(&wd_str, NULL);
699  delete wd;
700  tprintf(" str=%s", wd_str.string());
701  }
702  tprintf("\n");
703 }
WERD_CHOICE * ConstructWord(BLOB_CHOICE *b, ViterbiStateEntry *vse, CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, float certainties[], float *dawg_score, STATE *state, BlamerBundle *blamer_bundle, bool *truth_path)
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: strngs.h:40
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:294
bool tesseract::LanguageModel::ProblematicPath ( const ViterbiStateEntry & vse,
UNICHAR_ID  unichar_id,
bool  word_end 
)
protected

Definition at line 381 of file language_model.cpp.

382  {
     // Returns true if this path looks like it "went wrong" at the last
     // step, i.e. the entry is worse than its parent in one of four ways.
     // Used to decide where to generate pain points for re-classification.
383  // The path is problematic if it is inconsistent and has a parent that
384  // is consistent (or a NULL parent).
385  if (!vse.Consistent() && (vse.parent_vse == NULL ||
386  vse.parent_vse->Consistent())) {
387  if (language_model_debug_level > 0) {
388  tprintf("ProblematicPath: inconsistent\n");
389  }
390  return true;
391  }
392  // The path is problematic if it does not represent a dictionary word,
393  // while its parent does.
394  if (vse.dawg_info == NULL &&
395  (vse.parent_vse == NULL || vse.parent_vse->dawg_info != NULL)) {
396  if (language_model_debug_level > 0) {
397  tprintf("ProblematicPath: dict word terminated\n");
398  }
399  return true;
400  }
401  // The path is problematic if ngram info indicates that this path is
402  // an unlikely sequence of characters, while its parent does not have
403  // extreme dips in ngram probabilities.
404  if (vse.ngram_info != NULL && vse.ngram_info->pruned &&
405  (vse.parent_vse == NULL || !vse.parent_vse->ngram_info->pruned)) {
406  if (language_model_debug_level > 0) {
407  tprintf("ProblematicPath: small ngram prob\n");
408  }
409  return true;
410  }
411  // The path is problematic if there is a non-alpha character in the
412  // middle of the path (unless it is a digit in the middle of a path
413  // that represents a number).
414  if ((vse.parent_vse != NULL) && !word_end && // is middle
415  !(dict_->getUnicharset().get_isalpha(unichar_id) || // alpha
416  (dict_->getUnicharset().get_isdigit(unichar_id) && // ok digit
417  vse.dawg_info != NULL && vse.dawg_info->permuter == NUMBER_PERM))) {
418  if (language_model_debug_level > 0) {
419  tprintf("ProblematicPath: non-alpha middle\n");
420  }
421  return true;
422  }
423  return false;
424 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool tesseract::LanguageModel::PrunablePath ( LanguageModelFlagsType  top_choice_flags,
const LanguageModelDawgInfo dawg_info 
)
inlineprotected

Definition at line 719 of file language_model.h.

720  {
     // A path may be pruned from the viterbi lists unless it is protected:
     // any top-choice flag protects it, and so does membership in a system/
     // user/frequent-word dawg -- but the dawg protection only applies when
     // no fixed-length dawgs are loaded (GetMaxFixedLengthDawgIndex() < 0),
     // since with fixed-length dawgs dawg_info is kept on nearly all paths.
721  if (top_choice_flags) return false;
722  if (dawg_info != NULL &&
723  (dawg_info->permuter == SYSTEM_DAWG_PERM ||
724  dawg_info->permuter == USER_DAWG_PERM ||
725  dawg_info->permuter == FREQ_DAWG_PERM) &&
726  dict_->GetMaxFixedLengthDawgIndex() < 0) return false;
727  return true;
728  }
const int GetMaxFixedLengthDawgIndex() const
Definition: dict.h:616
#define NULL
Definition: host.h:144
void tesseract::LanguageModel::UpdateBestChoice ( BLOB_CHOICE b,
ViterbiStateEntry vse,
HEAP pain_points,
CHUNKS_RECORD chunks_record,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)
protected

Definition at line 1184 of file language_model.cpp.

1190  {
      // NOTE(review): this documentation listing is truncated -- source
      // lines 1261, 1295 and 1308 are missing (the heads of the
      // dict_->LogNewChoice(...), dict_->set_hyphen_word(...) and
      // best-choice-is-top-choice expressions). Consult language_model.cpp
      // for the full text; do not copy this listing verbatim.
      //
      // Constructs a WERD_CHOICE for the path ending at (b, vse), logs it,
      // and updates raw_choice / best_choice / blamer bookkeeping in
      // best_choice_bundle and blamer_bundle when the new word is better.
1191  int i;
1192  BLOB_CHOICE_LIST_VECTOR temp_best_char_choices(vse->length);
1193  for (i = 0; i < vse->length; ++i) {
1194  temp_best_char_choices.push_back(NULL);
1195  }
1196  float *certainties = new float[vse->length];
1197  STATE temp_state;
1198  // The fraction of letters in the path that are "covered" by dawgs.
1199  // For space delimited languages this number will be 0.0 for nonwords
1200  // and 1.0 for dictionary words. For non-space delimited languages
1201  // this number will be in the [0.0, 1.0] range.
1202  float dawg_score;
1203  bool truth_path;
1204  WERD_CHOICE *word = ConstructWord(b, vse, chunks_record,
1205  &temp_best_char_choices, certainties,
1206  &dawg_score, &temp_state,
1207  blamer_bundle, &truth_path);
1208  ASSERT_HOST(word != NULL);
      // When the blamer is hunting for the blame source we must not disturb
      // the regular best-choice bookkeeping; not_blaming gates all of it.
1209  bool not_blaming =
1210  (blamer_bundle == NULL || !blamer_bundle->segsearch_is_looking_for_blame);
1211 
1212  // Log new segmentation (for dict_->LogNewChoice()).
1213  if (not_blaming) {
1214  PIECES_STATE pieces_widths;
1215  bin_to_pieces(&temp_state, chunks_record->ratings->dimension() - 1,
1216  pieces_widths);
1217  dict_->LogNewSegmentation(pieces_widths);
1218  }
1219 
1220  if (language_model_debug_level > 0) {
1221  STRING word_str;
1222  word->string_and_lengths(&word_str, NULL);
1223  tprintf("UpdateBestChoice() constructed word %s\n", word_str.string());
1224  if (language_model_debug_level > 2) word->print();
      // NOTE(review): stray ';' below -- harmless empty statement.
1225  };
1226 
1227  // Update raw_choice if needed.
1228  if ((vse->top_choice_flags & kSmallestRatingFlag) &&
1229  word->rating() < best_choice_bundle->raw_choice->rating() &&
1230  not_blaming) {
1231  dict_->LogNewChoice(1.0, certainties, true, word, temp_best_char_choices);
1232  *(best_choice_bundle->raw_choice) = *word;
1233  best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
1234  if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
1235  }
1236 
1237  // When working with non-space delimited languages we re-adjust the cost
1238  // taking into account the final dawg_score and a more precise shape cost.
1239  // While constructing the paths we assume that they are all dictionary words
1240  // (since a single character would be a valid dictionary word). At the end
1241  // we compute dawg_score which reflects how many characters on the path are
1242  // "covered" by dictionary words of length > 1.
1243  if (vse->associate_stats.full_wh_ratio_var != 0.0f ||
1244  (dict_->GetMaxFixedLengthDawgIndex() >= 0 && dawg_score < 1.0f)) {
1245  vse->cost = ComputeAdjustedPathCost(
1246  vse->ratings_sum, vse->length, dawg_score, vse->dawg_info,
1247  vse->ngram_info, vse->consistency_info, vse->associate_stats,
1248  vse->parent_vse);
1249  if (language_model_debug_level > 0) {
1250  tprintf("Updated vse cost to %g (dawg_score %g full_wh_ratio_var %g)\n",
1251  vse->cost, dawg_score, vse->associate_stats.full_wh_ratio_var);
1252  }
1253  }
1254 
1255  // Update best choice and best char choices if needed.
1256  // TODO(daria): re-write AcceptableChoice() and NoDangerousAmbig()
1257  // to fit better into the new segmentation search.
1258  word->set_rating(vse->cost);
1259  if (word->rating() < best_choice_bundle->best_choice->rating() &&
1260  not_blaming) {
      // (truncated line 1261: dict_->LogNewChoice(... begins here)
1262  vse->ngram_info->ngram_cost :
1263  vse->ratings_sum),
1264  certainties, false, word, temp_best_char_choices);
1265  // Since the rating of the word could have been modified by
1266  // Dict::LogNewChoice() - check again.
1267  if (word->rating() < best_choice_bundle->best_choice->rating()) {
1268  bool modified_blobs; // not used
1269  DANGERR fixpt;
1270  if (dict_->AcceptableChoice(&temp_best_char_choices, word, &fixpt,
1271  ASSOCIATOR_CALLER, &modified_blobs) &&
1272  AcceptablePath(*vse)) {
1273  acceptable_choice_found_ = true;
1274  }
1275  // Update best_choice_bundle.
1276  *(best_choice_bundle->best_choice) = *word;
1277  best_choice_bundle->updated = true;
      // Replace the stored per-character choice lists with deep copies of
      // the new best ones (old pointers are deleted first to avoid a leak).
1278  best_choice_bundle->best_char_choices->delete_data_pointers();
1279  best_choice_bundle->best_char_choices->clear();
1280  for (i = 0; i < temp_best_char_choices.size(); ++i) {
1281  BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
1282  cc_list->deep_copy(temp_best_char_choices[i], &BLOB_CHOICE::deep_copy);
1283  best_choice_bundle->best_char_choices->push_back(cc_list);
1284  }
1285  best_choice_bundle->best_state->part2 = temp_state.part2;
1286  best_choice_bundle->best_state->part1 = temp_state.part1;
1287  if (language_model_debug_level > 0) {
1288  tprintf("Updated best choice\n");
1289  print_state("New state ", best_choice_bundle->best_state,
1290  chunks_record->ratings->dimension()-1);
1291  }
1292  // Update hyphen state if we are dealing with a dictionary word.
1293  if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
1294  if (dict_->has_hyphen_end(*word)) {
      // (truncated line 1295: dict_->set_hyphen_word(... begins here)
1296  *(dawg_args_->constraints));
1297  } else {
1298  dict_->reset_hyphen_vars(true);
1299  }
1300  }
1301  best_choice_bundle->best_vse = vse;
1302  best_choice_bundle->best_b = b;
1303  best_choice_bundle->fixpt = fixpt;
1304 
1305  if (blamer_bundle != NULL) {
1306  blamer_bundle->best_choice_is_dict_and_top_choice =
1307  (vse->dawg_info != NULL &&
      // (truncated line 1308 belongs here)
1309  (vse->top_choice_flags));
1310  }
1311  }
1312  }
1313  if (blamer_bundle != NULL) {
1314  // Record the current hypothesis in params_training_bundle.
1315  ParamsTrainingHypothesis &hyp =
1316  blamer_bundle->params_training_bundle.AddHypothesis();
1317  word->string_and_lengths(&(hyp.str), NULL);
1318  ExtractRawFeaturesFromPath(*vse, hyp.features);
1319  if (truth_path && // update best truth path rating
1320  word->rating() < blamer_bundle->best_correctly_segmented_rating) {
1321  blamer_bundle->best_correctly_segmented_rating = word->rating();
1322  }
1323  }
1324 
1325  // Clean up.
1326  delete[] certainties;
1327  delete word;
1328 }
bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, WERD_CHOICE *BestChoice, DANGERR *fixpt, ACCEPTABLE_CHOICE_CALLER caller, bool *modified_blobs)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:191
void set_rating(float new_val)
Definition: ratngs.h:255
WERD_CHOICE * ConstructWord(BLOB_CHOICE *b, ViterbiStateEntry *vse, CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, float certainties[], float *dawg_score, STATE *state, BlamerBundle *blamer_bundle, bool *truth_path)
bool best_choice_is_dict_and_top_choice
Definition: pageres.h:194
const int GetMaxFixedLengthDawgIndex() const
Definition: dict.h:616
float best_correctly_segmented_rating
Definition: pageres.h:187
#define NULL
Definition: host.h:144
uinT32 part1
Definition: states.h:41
void print_state(const char *label, STATE *state, int num_joints)
Definition: states.cpp:214
static const LanguageModelFlagsType kSmallestRatingFlag
void ExtractRawFeaturesFromPath(const ViterbiStateEntry &vse, float *features)
float ComputeAdjustedPathCost(float ratings_sum, int length, float dawg_score, const LanguageModelDawgInfo *dawg_info, const LanguageModelNgramInfo *ngram_info, const LanguageModelConsistencyInfo &consistency_info, const AssociateStats &associate_stats, ViterbiStateEntry *parent_vse)
int dimension() const
Definition: matrix.h:190
const char * string() const
Definition: strngs.cpp:156
bool AcceptablePath(const ViterbiStateEntry &vse)
const void print() const
Definition: ratngs.h:406
tesseract::ParamsTrainingBundle params_training_bundle
Definition: pageres.h:199
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
Definition: strngs.h:40
uinT32 part2
Definition: states.h:42
DawgInfoVector * active_dawgs
Definition: dict.h:77
DawgInfoVector * constraints
Definition: dict.h:78
bool segsearch_is_looking_for_blame
Definition: pageres.h:184
static BLOB_CHOICE * deep_copy(const BLOB_CHOICE *src)
Definition: ratngs.h:126
void LogNewSegmentation(PIECES_STATE BlobWidth)
Definition: stopper.cpp:463
void set_hyphen_word(const WERD_CHOICE &word, const DawgInfoVector &active_dawgs, const DawgInfoVector &constraints)
Definition: hyphen.cpp:50
uinT8 PIECES_STATE[MAX_NUM_CHUNKS+2]
Definition: states.h:49
MATRIX * ratings
Definition: associate.h:52
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: states.h:39
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:294
ParamsTrainingHypothesis & AddHypothesis()
void bin_to_pieces(STATE *state, int num_joints, PIECES_STATE pieces)
Definition: states.cpp:99
float rating() const
Definition: ratngs.h:231
void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)
Definition: stopper.cpp:484
void set_permuter(uinT8 perm)
Definition: ratngs.h:261
void tesseract::LanguageModel::UpdateCoveredByFixedLengthDawgs ( const DawgInfoVector active_dawgs,
int  word_index,
int  word_length,
int *  skip,
int *  covered,
float *  dawg_score,
bool *  dawg_score_done 
)
protected

Definition at line 1527 of file language_model.cpp.

1529  {
      // Walks the path backwards (word_index counts down to 0), accumulating
      // in *covered the number of characters that belong to fixed-length
      // dictionary words; *skip counts characters already accounted for by a
      // previously matched fixed-length word.
1530  if (language_model_debug_level > 3) {
      // NOTE(review): format string has two %d conversions but three
      // arguments are passed (word_length is never printed). Harmless with
      // C varargs, but the format presumably should also print word_length
      // -- confirm against upstream before relying on this debug output.
1531  tprintf("UpdateCoveredByFixedLengthDawgs for index %d skip=%d\n",
1532  word_index, *skip, word_length);
1533  }
1534 
1535  if (*skip > 0) {
1536  --(*skip);
1537  } else {
1538  int best_index = -1;
1539  for (int d = 0; d < active_dawgs.size(); ++d) {
1540  int dawg_index = (active_dawgs[d]).dawg_index;
1541  if (dawg_index > dict_->GetMaxFixedLengthDawgIndex()) {
1542  // If active_dawgs of the last ViterbiStateEntry on the path
1543  // contain a non-fixed length dawg, this means that the whole
1544  // path represents a word from a non-fixed length word dawg.
1545  if (word_index == (word_length - 1)) {
1546  *dawg_score = 1.0f;
1547  *dawg_score_done = true;
1548  return;
1549  }
      // For fixed-length dawgs the dawg index doubles as the word length
      // (see the use of best_index as a character count below), which is
      // why it is compared against kMinFixedLengthDawgLength here.
1550  } else if (dawg_index >= kMinFixedLengthDawgLength) {
1551  const Dawg *curr_dawg = dict_->GetDawg(dawg_index);
1552  ASSERT_HOST(curr_dawg != NULL);
      // Keep the longest fixed-length dawg that ends a word at this ref.
1553  if ((active_dawgs[d]).ref != NO_EDGE &&
1554  curr_dawg->end_of_word((active_dawgs[d]).ref) &&
1555  dawg_index > best_index) {
1556  best_index = dawg_index;
1557  }
1558 
1559  if (language_model_debug_level > 3) {
1560  tprintf("dawg_index %d, ref %d, eow %d\n", dawg_index,
1561  (active_dawgs[d]).ref,
1562  ((active_dawgs[d]).ref != NO_EDGE &&
1563  curr_dawg->end_of_word((active_dawgs[d]).ref)));
1564  }
1565  }
1566  } // end for
1567  if (best_index != -1) {
      // Count best_index characters as covered; skip the remaining
      // best_index-1 characters of this word on subsequent calls.
1568  *skip = best_index - 1;
1569  *covered += best_index;
1570  }
1571  } // end if/else skip
1572 
      // Reached the start of the word: finalize the score as the covered
      // fraction of the total word length.
1573  if (word_index == 0) {
1574  ASSERT_HOST(*covered <= word_length);
1575  *dawg_score = (static_cast<float>(*covered) /
1576  static_cast<float>(word_length));
1577  *dawg_score_done = true;
1578  }
1579 }
const int GetMaxFixedLengthDawgIndex() const
Definition: dict.h:616
#define NULL
Definition: host.h:144
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:605
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
static const int kMinFixedLengthDawgLength
#define ASSERT_HOST(x)
Definition: errcode.h:84
LanguageModelFlagsType tesseract::LanguageModel::UpdateState ( LanguageModelFlagsType  changed,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE_LIST *  parent_list,
HEAP pain_points,
BestPathByColumn best_path_by_column[],
CHUNKS_RECORD chunks_record,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

Definition at line 292 of file language_model.cpp.

301  {
     // NOTE(review): this documentation listing is truncated -- source line
     // 359 (between 358 and 360) is missing; it presumably contains the
     // vit_counter / language_model_viterbi_list_max_num_prunable check.
     // Consult language_model.cpp for the full text.
     //
     // For each (parent entry, current BLOB_CHOICE) pair, calls
     // AddViterbiStateEntry() to extend the search; returns the set of
     // change flags produced by the new entries.
302  if (language_model_debug_level > 0) {
303  tprintf("\nUpdateState: col=%d row=%d (changed=0x%x parent=%p)\n",
304  curr_col, curr_row, changed, parent_list);
305  }
306  // Initialize helper variables.
307  bool word_end = (curr_row+1 >= chunks_record->ratings->dimension());
308  bool just_classified = (changed & kJustClassifiedFlag);
309  LanguageModelFlagsType new_changed = 0x0;
310  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
311 
312  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
313  ViterbiStateEntry_IT vit;
314  BLOB_CHOICE_IT c_it(curr_list);
315  int c_it_counter = 0;
316  bool first_iteration = true;
317  BLOB_CHOICE *first_lower = NULL;
318  BLOB_CHOICE *first_upper = NULL;
319  GetTopChoiceLowerUpper(changed, curr_list, &first_lower, &first_upper);
320  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
     // With fixed-length dawgs loaded, only explore the top
     // language_model_fixed_length_choices_depth choices per cell.
321  if (dict_->GetMaxFixedLengthDawgIndex() >= 0 &&
322  c_it_counter++ >= language_model_fixed_length_choices_depth) {
323  break;
324  }
325  // Skip NULL unichars unless it is the only choice.
326  if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
327  if (dict_->getUnicharset().get_fragment(c_it.data()->unichar_id())) {
328  continue; // skip fragments
329  }
330  // Set top choice flags.
331  LanguageModelFlagsType top_choice_flags = 0x0;
     // NOTE(review): `changed | kSmallestRatingFlag` is always non-zero
     // (kSmallestRatingFlag == 0x1), so this condition reduces to just
     // first_iteration; a bitwise AND looks intended -- confirm upstream.
332  if (first_iteration && (changed | kSmallestRatingFlag)) {
333  top_choice_flags |= kSmallestRatingFlag;
334  }
335  if (first_lower == c_it.data()) top_choice_flags |= kLowerCaseFlag;
336  if (first_upper == c_it.data()) top_choice_flags |= kUpperCaseFlag;
337 
338  if (parent_list == NULL) { // process the beginning of a word
339  new_changed |= AddViterbiStateEntry(
340  top_choice_flags, denom, word_end, curr_col, curr_row,
341  c_it.data(), NULL, NULL, pain_points, best_path_by_column,
342  chunks_record, best_choice_bundle, blamer_bundle);
343  } else { // get viterbi entries from each of the parent BLOB_CHOICEs
344  BLOB_CHOICE_IT p_it(parent_list);
345  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
346  LanguageModelState *parent_lms =
347  reinterpret_cast<LanguageModelState *>(
348  p_it.data()->language_model_state());
349  if (parent_lms == NULL || parent_lms->viterbi_state_entries.empty()) {
350  continue;
351  }
352  vit.set_to_list(&(parent_lms->viterbi_state_entries));
353  int vit_counter = 0;
354  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
355  // Skip pruned entries and do not look at prunable entries if already
356  // examined language_model_viterbi_list_max_num_prunable of those.
357  if (PrunablePath(vit.data()->top_choice_flags,
358  vit.data()->dawg_info) &&
     // (truncated line 359 belongs here)
360  (language_model_ngram_on && vit.data()->ngram_info->pruned))) {
361  continue;
362  }
363  // Only consider the parent if it has been updated or
364  // if the current ratings cell has just been classified.
365  if (!just_classified && !vit.data()->updated) continue;
366  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
367  // looks good according to the Dawgs or character ngram model.
368  new_changed |= AddViterbiStateEntry(
369  top_choice_flags, denom, word_end, curr_col, curr_row,
370  c_it.data(), p_it.data(), vit.data(), pain_points,
371  best_path_by_column, chunks_record,
372  best_choice_bundle, blamer_bundle);
373  }
374  } // done looking at parents for this c_it.data()
375  }
376  first_iteration = false;
377  }
378  return new_changed;
379 }
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
void GetTopChoiceLowerUpper(LanguageModelFlagsType changed, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper)
bool PrunablePath(LanguageModelFlagsType top_choice_flags, const LanguageModelDawgInfo *dawg_info)
const int GetMaxFixedLengthDawgIndex() const
Definition: dict.h:616
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
static const LanguageModelFlagsType kSmallestRatingFlag
LanguageModelFlagsType AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
int dimension() const
Definition: matrix.h:190
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
static const LanguageModelFlagsType kLowerCaseFlag
static const LanguageModelFlagsType kUpperCaseFlag
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
static const LanguageModelFlagsType kJustClassifiedFlag
int language_model_viterbi_list_max_num_prunable
MATRIX * ratings
Definition: associate.h:52
unsigned char LanguageModelFlagsType

Member Data Documentation

bool tesseract::LanguageModel::acceptable_choice_found_
protected

Definition at line 844 of file language_model.h.

DawgInfoVector* tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 827 of file language_model.h.

DawgInfoVector* tesseract::LanguageModel::beginning_constraints_
protected

Definition at line 828 of file language_model.h.

bool tesseract::LanguageModel::correct_segmentation_explored_
protected

Definition at line 846 of file language_model.h.

DawgArgs* tesseract::LanguageModel::dawg_args_
protected

Definition at line 786 of file language_model.h.

Dict* tesseract::LanguageModel::dict_
protected

Definition at line 807 of file language_model.h.

DawgInfoVector* tesseract::LanguageModel::empty_dawg_info_vec_
protected

Definition at line 830 of file language_model.h.

DawgInfoVector* tesseract::LanguageModel::fixed_length_beginning_active_dawgs_
protected

Definition at line 829 of file language_model.h.

bool tesseract::LanguageModel::fixed_pitch_
protected

Definition at line 814 of file language_model.h.

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_
protected

Definition at line 803 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kAllChangedFlag = 0xff
static

Definition at line 293 of file language_model.h.

const float tesseract::LanguageModel::kBestChoicePainPointPriorityAdjustment = 0.5f
static

Definition at line 269 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kConsistentFlag = 0x8
static

Definition at line 289 of file language_model.h.

const float tesseract::LanguageModel::kCriticalPainPointPriorityAdjustment = 0.1f
static

Definition at line 270 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kDawgFlag = 0x10
static

Definition at line 290 of file language_model.h.

const float tesseract::LanguageModel::kDefaultPainPointPriorityAdjustment = 2.0f
static

Definition at line 268 of file language_model.h.

const float tesseract::LanguageModel::kInitialPainPointPriorityAdjustment = 5.0f
static

Definition at line 267 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kJustClassifiedFlag = 0x80
static

Definition at line 292 of file language_model.h.

const float tesseract::LanguageModel::kLooseMaxCharWhRatio = 2.5f
static

Definition at line 282 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 287 of file language_model.h.

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 274 of file language_model.h.

const int tesseract::LanguageModel::kMinFixedLengthDawgLength = 2
static

Definition at line 278 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kNgramFlag = 0x20
static

Definition at line 291 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 286 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 288 of file language_model.h.

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 738 of file language_model.h.

int tesseract::LanguageModel::language_model_fixed_length_choices_depth = 3

"Depth of blob choice lists to explore" " when fixed length dawgs are on"

Definition at line 766 of file language_model.h.

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 763 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 752 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 740 of file language_model.h.

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 742 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the" " character classifier "

Definition at line 758 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"

Definition at line 750 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 760 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities"

Definition at line 755 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 775 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 779 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 781 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 784 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 771 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 769 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 773 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 777 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 783 of file language_model.h.

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 786 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is true)" " entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 745 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 747 of file language_model.h.

float tesseract::LanguageModel::max_char_wh_ratio_
protected

Definition at line 817 of file language_model.h.

float tesseract::LanguageModel::max_penalty_adjust_
protected

Definition at line 832 of file language_model.h.

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 824 of file language_model.h.

int tesseract::LanguageModel::prev_word_unichar_step_len_
protected

Definition at line 825 of file language_model.h.

float tesseract::LanguageModel::rating_cert_scale_
protected

Definition at line 798 of file language_model.h.

GenericVector<bool *> tesseract::LanguageModel::updated_flags_
protected

Definition at line 796 of file language_model.h.


The documentation for this class was generated from the following files: