39 STATE *output_best_state,
43 tprintf(
"Starting SegSearch on ratings matrix:\n");
72 for (col = 0; col < ratings->
dimension(); ++col) {
85 pain_points, chunks_record, blamer_bundle,
89 float pain_point_priority;
91 output_best_state, best_choice, raw_choice, best_char_choices);
102 SEG_SEARCH_PENDING_LIST *pending =
103 new SEG_SEARCH_PENDING_LIST[ratings->
dimension()];
106 for (row = 0; row < ratings->
dimension(); ++row) {
108 pending[0].add_sorted(
114 pain_points, &best_choice_bundle, blamer_bundle);
117 int num_futile_classifications = 0;
120 (blamer_bundle !=
NULL &&
121 blamer_bundle->segsearch_is_looking_for_blame)) {
125 pop =
HeapPop(pain_points, &pain_point_priority, &pain_point);
126 if (pop ==
EMPTY)
break;
127 if (pain_point->
Valid(*ratings) &&
140 chunks_record, pain_points, blamer_bundle);
143 chunks_record, pain_points, &best_choice_bundle,
145 if (!best_choice_bundle.
updated) ++num_futile_classifications;
148 tprintf(
"num_futile_classifications %d\n", num_futile_classifications);
151 best_choice_bundle.
updated =
false;
157 blamer_bundle->incorrect_result_reason ==
IRR_CORRECT &&
158 !blamer_bundle->segsearch_is_looking_for_blame &&
159 blamer_bundle->truth_has_char_boxes &&
161 best_choice, blamer_bundle->truth_text)) {
163 pain_points, blamer_bundle, &blamer_debug);
167 blamer_bundle, &blamer_debug);
170 tprintf(
"Done with SegSearch (AcceptableChoiceFound: %d)\n",
176 delete[] best_path_by_column;
178 for (row = 0; row < ratings->
dimension(); ++row) {
179 for (col = 0; col <= row; ++col) {
180 BLOB_CHOICE_LIST *rating = ratings->
get(col, row);
188 SEG_SEARCH_PENDING_LIST *pending[],
195 for (
int col = starting_col; col < ratings->
dimension(); ++col) {
197 tprintf(
"\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
200 SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
201 SEG_SEARCH_PENDING_IT pending_it(pending_list);
203 while (!pending_it.empty()) {
206 if (non_empty_rows.
length() == 0 ||
210 BLOB_CHOICE_LIST *current_node = ratings->
get(col, p->
child_row);
213 current_node, p->
parent, pain_points,
214 best_path_by_column, chunks_record,
215 best_choice_bundle, blamer_bundle);
221 for (
int child_row = child_col;
222 child_row < ratings->
dimension(); ++child_row) {
228 (*pending)[child_col].add_sorted_and_find(
230 if (new_pending != actual_new_pending)
delete new_pending;
231 actual_new_pending->
changed |= new_changed;
233 tprintf(
"Added child(col=%d row=%d) parent(col=%d row=%d)"
234 " changed=0x%x to pending\n", child_col,
242 pending_it.forward();
246 pain_points, best_path_by_column, chunks_record);
249 if (best_choice_bundle->
updated) {
251 pain_points, chunks_record, best_choice_bundle);
260 SEG_SEARCH_PENDING_LIST *pending[],
265 tprintf(
"Classifying pain point priority=%.4f, col=%d, row=%d\n",
266 pain_point_priority, pain_point.
col, pain_point.
row);
272 pain_point.
col, pain_point.
row, blamer_bundle);
273 ratings->
put(pain_point.
col, pain_point.
row, classified);
277 ratings->
get(pain_point.
col, pain_point.
row),
284 if (!classified->empty()) {
285 float worst_piece_cert;
287 if (pain_point.
col > 0) {
290 &worst_piece_cert, &fragmented);
292 pain_point.
col-1, pain_point.
row,
false,
294 worst_piece_cert, fragmented, best_choice->
certainty(),
296 chunks_record, pain_points);
301 &worst_piece_cert, &fragmented);
303 pain_point.
col, pain_point.
row+1,
true,
305 worst_piece_cert, fragmented, best_choice->
certainty(),
307 chunks_record, pain_points);
312 int parent_row = pain_point.
col - 1;
313 if (parent_row < 0) {
314 (*pending)[pain_point.
col].add_sorted(
319 for (
int parent_col = 0; parent_col < pain_point.
col; ++parent_col) {
321 (*pending)[pain_point.
col].add_sorted(
324 ratings->
get(parent_col, parent_row),
338 tprintf(
"segsearch starting to look for blame\n");
342 float pain_point_priority;
344 while ((pop =
HeapPop(pain_points, &pain_point_priority,
345 &pain_point)) !=
EMPTY) {
350 *blamer_debug +=
"Correct segmentation:\n";
357 *blamer_debug +=
"\n";
365 NULL,
NULL, chunks_record, pain_points)) {
367 *blamer_debug +=
"\nFailed to insert pain point\n";
393 *blamer_debug =
"Best choice is: incorrect, top choice, dictionary word";
394 *blamer_debug +=
" with permuter ";
400 *blamer_debug +=
"Correct segmentation state was not explored";
406 *blamer_debug +=
"Correct segmentation paths were pruned by LM\n";
408 char debug_buffer[256];
409 *blamer_debug +=
"Best correct segmentation rating ";
410 sprintf(debug_buffer,
"%g",
412 *blamer_debug += debug_buffer;
413 *blamer_debug +=
" vs. best choice rating ";
414 sprintf(debug_buffer,
"%g", best_choice->
rating());
415 *blamer_debug += debug_buffer;
bool ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
ViterbiStateEntry * best_vse
void FinishBlamerForSegSearch(const WERD_CHOICE *best_choice, BlamerBundle *blamer_bundle, STRING *blamer_debug)
void set_rating(float new_val)
tesseract::LanguageModelFlagsType changed
GenericVector< int > correct_segmentation_cols
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const WERD_CHOICE *best_choice, SEG_SEARCH_PENDING_LIST *pending[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle)
void SegSearch(CHUNKS_RECORD *chunks_record, WERD_CHOICE *best_choice, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *raw_choice, STATE *output_best_state, BlamerBundle *blamer_bundle)
ELISTIZE(SEG_SEARCH_PENDING)
bool best_choice_is_dict_and_top_choice
double segsearch_max_char_wh_ratio
float best_correctly_segmented_rating
void GeneratePainPointsFromBestChoice(HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle)
HEAP * MakeHeap(int Size)
const UNICHARSET & getUnicharset() const
T get(int column, int row) const
void ClearBestChoiceAccum()
Clears best_choices_ list accumulated by the stopper.
bool SegSearchDone(int num_futile_classifications)
void FreeHeapData(HEAP *Heap, void_dest destructor)
void GeneratePainPointsFromColumn(int col, const GenericVector< int > &non_empty_rows, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio, float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BlamerBundle *blamer_bundle, bool debug_blamer)
static const float kInitialPainPointPriorityAdjustment
static const LanguageModelFlagsType kAllChangedFlag
bool assume_fixed_pitch_char_segment
WERD_CHOICE * prev_word_best_choice_
void add_str_int(const char *str, int number)
int segsearch_max_pain_points
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr)
static int compare(const void *p1, const void *p2)
void put(int column, int row, const T &thing)
WERD_CHOICE * best_choice
bool AcceptableChoiceFound()
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
void InitBlamerForSegSearch(const WERD_CHOICE *best_choice, CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
static const float kBadRating
void DeleteState(BLOB_CHOICE_LIST *choices)
DLLSYM void tprintf(const char *format,...)
bool wordrec_debug_blamer
bool Valid(const MATRIX &m) const
void print(const UNICHARSET &unicharset) const
virtual BLOB_CHOICE_LIST * classify_piece(TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
const char * permuter_name() const
LanguageModelFlagsType UpdateState(LanguageModelFlagsType changed, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE_LIST *parent_list, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
LanguageModel * language_model_
void UpdateSegSearchNodes(int starting_col, SEG_SEARCH_PENDING_LIST *pending[], BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool GeneratePainPoint(int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
bool segsearch_is_looking_for_blame
int segsearch_debug_level
unsigned char LanguageModelFlagsType
GenericVector< int > correct_segmentation_rows
static void Delete(void *arg)
BLOB_CHOICE_LIST * parent